author    Eric Anholt <eric@anholt.net>    2010-06-06 16:45:41 -0700
committer Eric Anholt <eric@anholt.net>    2010-06-07 02:36:13 -0700
commit    5d30aad05f78adb42ee45b724fbff50fffa4e94f
tree      c6327d9e47ab753d8d7ee63cad201b3b8425f2f5
parent    66375fd6e8d3e95df5d124883a1426460c1b8ed8
intel: Stop using sized buckets for the BO cache. (no-more-buckets)
The theory is that we don't keep *that* many buffers around, so the pain of finding the right size in a single list is less than the pain of using gratuitous amounts of system memory for our BOs.

This brings my GM45 GNOME desktop with 2 terminals from 39MB allocated (13.8MB pinned) to 32MB allocated (11.0MB pinned), and the same desktop with an openarena demo running from 69MB to 55MB.

There is no significant performance difference on cairo-gl (firefox-talos-gfx, 3 samples, comparing lowest times), which is the most BO-allocation-sensitive workload I know of.
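For reference, the following is a minimal, self-contained sketch of the single-list scheme this patch switches to; it is not libdrm code, and all names (cached_bo, bo_cache_put, bo_cache_get_for_render, bo_cache_get_for_cpu) are hypothetical. Freed buffers are appended at the tail, render-target allocations search from the tail (MRU, likely still hot in the GPU cache and aperture), and CPU-mapped allocations search from the head (LRU, most likely already idle). Busy checks, madvise handling, and locking are omitted.

/* Sketch only: illustrates the single-list BO cache idea, not libdrm's API. */
#include <stddef.h>
#include <stdio.h>

struct cached_bo {
	size_t size;
	struct cached_bo *prev, *next;	/* doubly linked; the list uses a sentinel */
};

/* Sentinel node: an empty list points at itself. */
static void bo_cache_init(struct cached_bo *cache)
{
	cache->prev = cache->next = cache;
}

static void bo_cache_unlink(struct cached_bo *bo)
{
	bo->prev->next = bo->next;
	bo->next->prev = bo->prev;
}

/* Freed BOs go to the tail, so the tail end is most-recently-used (MRU). */
static void bo_cache_put(struct cached_bo *cache, struct cached_bo *bo)
{
	bo->prev = cache->prev;
	bo->next = cache;
	cache->prev->next = bo;
	cache->prev = bo;
}

/* Render targets: take an exact-size match from the tail (MRU), since it is
 * most likely still hot in the GPU cache and bound in the aperture. */
static struct cached_bo *bo_cache_get_for_render(struct cached_bo *cache,
						 size_t size)
{
	struct cached_bo *bo;

	for (bo = cache->prev; bo != cache; bo = bo->prev) {
		if (bo->size == size) {
			bo_cache_unlink(bo);
			return bo;
		}
	}
	return NULL;
}

/* CPU uploads: take an exact-size match from the head (LRU), which is the
 * buffer most likely to be idle; the real code also checks whether the BO
 * is busy and allocates fresh rather than wait on the GPU. */
static struct cached_bo *bo_cache_get_for_cpu(struct cached_bo *cache,
					      size_t size)
{
	struct cached_bo *bo;

	for (bo = cache->next; bo != cache; bo = bo->next) {
		if (bo->size == size) {
			bo_cache_unlink(bo);
			return bo;
		}
	}
	return NULL;
}

int main(void)
{
	struct cached_bo cache, a = { .size = 4096 }, b = { .size = 4096 };

	bo_cache_init(&cache);
	bo_cache_put(&cache, &a);	/* freed first: LRU end */
	bo_cache_put(&cache, &b);	/* freed last: MRU end */

	printf("render gets b (MRU): %s\n",
	       bo_cache_get_for_render(&cache, 4096) == &b ? "yes" : "no");
	printf("cpu gets a (LRU): %s\n",
	       bo_cache_get_for_cpu(&cache, 4096) == &a ? "yes" : "no");
	return 0;
}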
-rw-r--r--  intel/intel_bufmgr_gem.c | 296
 1 file changed, 136 insertions(+), 160 deletions(-)
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 398c7d07..eebcf4ab 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -68,40 +68,7 @@
typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
-struct drm_intel_gem_bo_bucket {
- drmMMListHead head;
- unsigned long size;
-};
-
-/* Only cache objects up to 64MB. Bigger than that, and the rounding of the
- * size makes many operations fail that wouldn't otherwise.
- */
-#define DRM_INTEL_GEM_BO_BUCKETS 14
-typedef struct _drm_intel_bufmgr_gem {
- drm_intel_bufmgr bufmgr;
-
- int fd;
-
- int max_relocs;
-
- pthread_mutex_t lock;
-
- struct drm_i915_gem_exec_object *exec_objects;
- struct drm_i915_gem_exec_object2 *exec2_objects;
- drm_intel_bo **exec_bos;
- int exec_size;
- int exec_count;
-
- /** Array of lists of cached gem objects of power-of-two sizes */
- struct drm_intel_gem_bo_bucket cache_bucket[DRM_INTEL_GEM_BO_BUCKETS];
-
- uint64_t gtt_size;
- int available_fences;
- int pci_device;
- int gen;
- char bo_reuse;
- char fenced_relocs;
-} drm_intel_bufmgr_gem;
+#define BO_CACHE_MAX_SIZE (64 * 1024 * 1024)
#define DRM_INTEL_RELOC_FENCE (1<<0)
@@ -150,7 +117,8 @@ struct _drm_intel_bo_gem {
void *gtt_virtual;
/** BO cache list */
- drmMMListHead head;
+ drm_intel_bo_gem *next;
+ drm_intel_bo_gem *prev;
/**
* Boolean of whether this BO and its children have been included in
@@ -190,6 +158,37 @@ struct _drm_intel_bo_gem {
int reloc_tree_fences;
};
+typedef struct _drm_intel_bufmgr_gem {
+ drm_intel_bufmgr bufmgr;
+
+ int fd;
+
+ int max_relocs;
+
+ pthread_mutex_t lock;
+
+ struct drm_i915_gem_exec_object *exec_objects;
+ struct drm_i915_gem_exec_object2 *exec2_objects;
+ drm_intel_bo **exec_bos;
+ int exec_size;
+ int exec_count;
+
+ /**
+ * List structure for the BO cache. It would be nice to not
+ * store this thing as a bo_gem but just a pair of pointers,
+ * but the libdrm list macros are sadly not the linux list
+ * macros.
+ */
+ drm_intel_bo_gem bo_cache;
+
+ uint64_t gtt_size;
+ int available_fences;
+ int pci_device;
+ int gen;
+ char bo_reuse;
+ char fenced_relocs;
+} drm_intel_bufmgr_gem;
+
static unsigned int
drm_intel_gem_estimate_batch_space(drm_intel_bo ** bo_array, int count);
@@ -279,23 +278,6 @@ drm_intel_gem_bo_tile_pitch(drm_intel_bufmgr_gem *bufmgr_gem,
return i;
}
-static struct drm_intel_gem_bo_bucket *
-drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
- unsigned long size)
-{
- int i;
-
- for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
- struct drm_intel_gem_bo_bucket *bucket =
- &bufmgr_gem->cache_bucket[i];
- if (bucket->size >= size) {
- return bucket;
- }
- }
-
- return NULL;
-}
-
static void
drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
{
@@ -529,23 +511,84 @@ drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
/* drop the oldest entries that have been purged by the kernel */
static void
-drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
- struct drm_intel_gem_bo_bucket *bucket)
+drm_intel_gem_bo_cache_purge(drm_intel_bufmgr_gem *bufmgr_gem)
{
- while (!DRMLISTEMPTY(&bucket->head)) {
- drm_intel_bo_gem *bo_gem;
+ while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+ drm_intel_bo_gem *bo_gem = bufmgr_gem->bo_cache.next;
- bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
- bucket->head.next, head);
if (drm_intel_gem_bo_madvise_internal
(bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
break;
- DRMLISTDEL(&bo_gem->head);
+ DRMLISTDEL(bo_gem);
drm_intel_gem_bo_free(&bo_gem->bo);
}
}
+/* For non-render-target BOs (where we're probably going to map it
+ * first thing in order to fill it with data), check if the last
+ * appropriately-sized BO in the cache is unbusy, and only reuse in
+ * that case. Otherwise, allocating a new buffer is probably faster
+ * than waiting for the GPU to finish.
+ */
+static drm_intel_bo_gem *
+find_cached_bo_for_cpu(drm_intel_bufmgr *bufmgr, unsigned long size)
+{
+ drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
+ drm_intel_bo_gem *bo_gem, *temp;
+
+retry:
+ DRMLISTFOREACHSAFE(bo_gem, temp, &bufmgr_gem->bo_cache) {
+ if (bo_gem->bo.size != size)
+ continue;
+
+ if (drm_intel_gem_bo_busy(&bo_gem->bo))
+ return NULL;
+
+ DRMLISTDEL(bo_gem);
+
+ if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+ I915_MADV_WILLNEED)) {
+ drm_intel_gem_bo_free(&bo_gem->bo);
+ drm_intel_gem_bo_cache_purge(bufmgr_gem);
+ goto retry;
+ } else {
+ return bo_gem;
+ }
+ }
+
+ return NULL;
+}
+
+/* Allocate new render-target BOs from the tail (MRU) of the list, as
+ * it will likely be hot in the GPU cache and in the aperture for us.
+ */
+static drm_intel_bo_gem *
+find_cached_bo_for_render(drm_intel_bufmgr *bufmgr, unsigned long size)
+{
+ drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
+ drm_intel_bo_gem *bo_gem, *temp;
+
+retry:
+ DRMLISTFOREACHSAFEREVERSE(bo_gem, temp, &bufmgr_gem->bo_cache) {
+ if (bo_gem->bo.size != size)
+ continue;
+
+ DRMLISTDEL(bo_gem);
+
+ if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+ I915_MADV_WILLNEED)) {
+ drm_intel_gem_bo_free(&bo_gem->bo);
+ drm_intel_gem_bo_cache_purge(bufmgr_gem);
+ goto retry;
+ } else {
+ return bo_gem;
+ }
+ }
+
+ return NULL;
+}
+
static drm_intel_bo *
drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
const char *name,
@@ -553,83 +596,35 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
unsigned long flags)
{
drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
- drm_intel_bo_gem *bo_gem;
+ drm_intel_bo_gem *bo_gem = NULL;
unsigned int page_size = getpagesize();
int ret;
- struct drm_intel_gem_bo_bucket *bucket;
- int alloc_from_cache;
- unsigned long bo_size;
int for_render = 0;
if (flags & BO_ALLOC_FOR_RENDER)
for_render = 1;
- /* Round the allocated size up to a power of two number of pages. */
- bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
+ size = ALIGN(size, page_size);
- /* If we don't have caching at this size, don't actually round the
- * allocation up.
- */
- if (bucket == NULL) {
- bo_size = size;
- if (bo_size < page_size)
- bo_size = page_size;
- } else {
- bo_size = bucket->size;
- }
-
- pthread_mutex_lock(&bufmgr_gem->lock);
- /* Get a buffer out of the cache if available */
-retry:
- alloc_from_cache = 0;
- if (bucket != NULL && !DRMLISTEMPTY(&bucket->head)) {
- if (for_render) {
- /* Allocate new render-target BOs from the tail (MRU)
- * of the list, as it will likely be hot in the GPU
- * cache and in the aperture for us.
- */
- bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
- bucket->head.prev, head);
- DRMLISTDEL(&bo_gem->head);
- alloc_from_cache = 1;
- } else {
- /* For non-render-target BOs (where we're probably
- * going to map it first thing in order to fill it
- * with data), check if the last BO in the cache is
- * unbusy, and only reuse in that case. Otherwise,
- * allocating a new buffer is probably faster than
- * waiting for the GPU to finish.
- */
- bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
- bucket->head.next, head);
- if (!drm_intel_gem_bo_busy(&bo_gem->bo)) {
- alloc_from_cache = 1;
- DRMLISTDEL(&bo_gem->head);
- }
- }
-
- if (alloc_from_cache) {
- if (!drm_intel_gem_bo_madvise_internal
- (bufmgr_gem, bo_gem, I915_MADV_WILLNEED)) {
- drm_intel_gem_bo_free(&bo_gem->bo);
- drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem,
- bucket);
- goto retry;
- }
- }
+ if (size < BO_CACHE_MAX_SIZE) {
+ pthread_mutex_lock(&bufmgr_gem->lock);
+ if (flags & BO_ALLOC_FOR_RENDER)
+ bo_gem = find_cached_bo_for_render(bufmgr, size);
+ else
+ bo_gem = find_cached_bo_for_cpu(bufmgr, size);
+ pthread_mutex_unlock(&bufmgr_gem->lock);
}
- pthread_mutex_unlock(&bufmgr_gem->lock);
- if (!alloc_from_cache) {
+ if (!bo_gem) {
struct drm_i915_gem_create create;
bo_gem = calloc(1, sizeof(*bo_gem));
if (!bo_gem)
return NULL;
- bo_gem->bo.size = bo_size;
+ bo_gem->bo.size = size;
memset(&create, 0, sizeof(create));
- create.size = bo_size;
+ create.size = size;
do {
ret = ioctl(bufmgr_gem->fd,
@@ -653,7 +648,8 @@ retry:
bo_gem->has_error = 0;
bo_gem->tiling_mode = I915_TILING_NONE;
bo_gem->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
- bo_gem->reusable = 1;
+ if (size < BO_CACHE_MAX_SIZE)
+ bo_gem->reusable = 1;
drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem);
@@ -820,24 +816,16 @@ drm_intel_gem_bo_free(drm_intel_bo *bo)
static void
drm_intel_gem_cleanup_bo_cache(drm_intel_bufmgr_gem *bufmgr_gem, time_t time)
{
- int i;
-
- for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
- struct drm_intel_gem_bo_bucket *bucket =
- &bufmgr_gem->cache_bucket[i];
-
- while (!DRMLISTEMPTY(&bucket->head)) {
- drm_intel_bo_gem *bo_gem;
+ while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+ drm_intel_bo_gem *bo_gem;
- bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
- bucket->head.next, head);
- if (time - bo_gem->free_time <= 1)
- break;
+ bo_gem = bufmgr_gem->bo_cache.next;
+ if (time - bo_gem->free_time <= 1)
+ break;
- DRMLISTDEL(&bo_gem->head);
+ DRMLISTDEL(bo_gem);
- drm_intel_gem_bo_free(&bo_gem->bo);
- }
+ drm_intel_gem_bo_free(&bo_gem->bo);
}
}
@@ -846,7 +834,6 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
{
drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
- struct drm_intel_gem_bo_bucket *bucket;
uint32_t tiling_mode;
int i;
@@ -872,10 +859,9 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
bo_gem->relocs = NULL;
}
- bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, bo->size);
/* Put the buffer into our internal cache for reuse if we can. */
tiling_mode = I915_TILING_NONE;
- if (bufmgr_gem->bo_reuse && bo_gem->reusable && bucket != NULL &&
+ if (bufmgr_gem->bo_reuse && bo_gem->reusable &&
drm_intel_gem_bo_set_tiling(bo, &tiling_mode, 0) == 0 &&
drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
I915_MADV_DONTNEED)) {
@@ -884,7 +870,7 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
bo_gem->name = NULL;
bo_gem->validate_index = -1;
- DRMLISTADDTAIL(&bo_gem->head, &bucket->head);
+ DRMLISTADDTAIL(bo_gem, &bufmgr_gem->bo_cache);
drm_intel_gem_cleanup_bo_cache(bufmgr_gem, time);
} else {
@@ -1241,7 +1227,7 @@ static void
drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
{
drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
- int i;
+ drm_intel_bo_gem *bo_gem;
free(bufmgr_gem->exec2_objects);
free(bufmgr_gem->exec_objects);
@@ -1250,18 +1236,12 @@ drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
pthread_mutex_destroy(&bufmgr_gem->lock);
/* Free any cached buffer objects we were going to reuse */
- for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
- struct drm_intel_gem_bo_bucket *bucket =
- &bufmgr_gem->cache_bucket[i];
- drm_intel_bo_gem *bo_gem;
- while (!DRMLISTEMPTY(&bucket->head)) {
- bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
- bucket->head.next, head);
- DRMLISTDEL(&bo_gem->head);
+ while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+ bo_gem = bufmgr_gem->bo_cache.next;
+ DRMLISTDEL(bo_gem);
- drm_intel_gem_bo_free(&bo_gem->bo);
- }
+ drm_intel_gem_bo_free(&bo_gem->bo);
}
free(bufmgr);
@@ -1723,9 +1703,10 @@ drm_intel_gem_bo_flink(drm_intel_bo *bo, uint32_t * name)
/**
* Enables unlimited caching of buffer objects for reuse.
*
- * This is potentially very memory expensive, as the cache at each bucket
- * size is only bounded by how many buffers of that size we've managed to have
- * in flight at once.
+ * This is potentially very memory expensive, as the cache at each bo
+ * size is only bounded by how many buffers of that size we've managed
+ * to have in flight at once. However, the kernel's memory pressure
+ * handler will throw out BOs if required.
*/
void
drm_intel_bufmgr_gem_enable_reuse(drm_intel_bufmgr *bufmgr)
@@ -1985,8 +1966,7 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
drm_intel_bufmgr_gem *bufmgr_gem;
struct drm_i915_gem_get_aperture aperture;
drm_i915_getparam_t gp;
- int ret, i;
- unsigned long size;
+ int ret;
int exec2 = 0, has_bsd = 0;
bufmgr_gem = calloc(1, sizeof(*bufmgr_gem));
@@ -2112,11 +2092,7 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
drm_intel_gem_get_pipe_from_crtc_id;
bufmgr_gem->bufmgr.bo_references = drm_intel_gem_bo_references;
- /* Initialize the linked lists for BO reuse cache. */
- for (i = 0, size = 4096; i < DRM_INTEL_GEM_BO_BUCKETS; i++, size *= 2) {
- DRMINITLISTHEAD(&bufmgr_gem->cache_bucket[i].head);
- bufmgr_gem->cache_bucket[i].size = size;
- }
+ DRMINITLISTHEAD(&bufmgr_gem->bo_cache);
return &bufmgr_gem->bufmgr;
}