diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2016-09-08 19:06:54 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2016-09-08 21:11:04 +0100 |
commit | 5c81016cd250babd4e2a764af9fde8c83a320357 (patch) | |
tree | cbfeabb07f08261c976e6eae1f0494fa416c765f | |
parent | 8bde63fe23587194667f39e27ea92f9f046fb0db (diff) |
igt/gem_exec_latency: Initial sketch for measuring execbuf cost in GPU cycles
Similar to benchmarks/gem_latency, but focused on the dispatch cost
rather than the wakeup cost, and also measuring inter-engine costs. This
would probably still be better as a perf test.
Results on ivb (Ivybridge) over the years:
IGT-Version: 1.16-gebee919 (x86_64) (Linux: 3.10-3-amd64 x86_64)
render: dispatch latency: 50.90, execution latency: 57.29 (target 4.64)
bsd: dispatch latency: 41.45, execution latency: 41.43 (target 4.75)
blt: dispatch latency: 41.02, execution latency: 41.00 (target 4.99)
IGT-Version: 1.16-gebee919 (x86_64) (Linux: 4.8.0-rc5+ x86_64)
render: dispatch latency: 12.61, execution latency: 15.44 (target 1.71)
bsd: dispatch latency: 12.08, execution latency: 12.07 (target 1.80)
blt: dispatch latency: 12.59, execution latency: 12.58 (target 1.85)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r-- | tests/Makefile.sources | 1 | ||||
-rw-r--r-- | tests/gem_exec_latency.c | 358 |
2 files changed, 359 insertions, 0 deletions
diff --git a/tests/Makefile.sources b/tests/Makefile.sources index 7d2b7289..598ec6f0 100644 --- a/tests/Makefile.sources +++ b/tests/Makefile.sources @@ -37,6 +37,7 @@ TESTS_progs_M = \ gem_exec_faulting_reloc \ gem_exec_flush \ gem_exec_gttfill \ + gem_exec_latency \ gem_exec_nop \ gem_exec_parallel \ gem_exec_params \ diff --git a/tests/gem_exec_latency.c b/tests/gem_exec_latency.c new file mode 100644 index 00000000..16f87d35 --- /dev/null +++ b/tests/gem_exec_latency.c @@ -0,0 +1,358 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include "igt.h" +#include "igt_sysfs.h" +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <inttypes.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <time.h> +#include "drm.h" + +#define LOCAL_I915_EXEC_NO_RELOC (1<<11) +#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12) + +#define LOCAL_I915_EXEC_BSD_SHIFT (13) +#define LOCAL_I915_EXEC_BSD_MASK (3 << LOCAL_I915_EXEC_BSD_SHIFT) + +#define ENGINE_FLAGS (I915_EXEC_RING_MASK | LOCAL_I915_EXEC_BSD_MASK) + +#define RCS_TIMESTAMP (0x2000 + 0x358) +static void latency_on_ring(int fd, unsigned ring, const char *name) +{ + const int gen = intel_gen(intel_get_drm_devid(fd)); + const int has_64bit_reloc = gen >= 8; + struct drm_i915_gem_exec_object2 obj[2]; + struct drm_i915_gem_relocation_entry reloc; + struct drm_i915_gem_execbuffer2 execbuf; + volatile uint32_t *reg; + uint32_t start, end, *map, *results; + uint64_t offset; + double gpu_latency; + int i, j; + + reg = (volatile uint32_t *)((volatile char *)igt_global_mmio + RCS_TIMESTAMP); + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffers_ptr = (uintptr_t)obj; + execbuf.buffer_count = 2; + execbuf.flags = ring; + execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC | LOCAL_I915_EXEC_HANDLE_LUT; + + memset(obj, 0, sizeof(obj)); + obj[0].handle = gem_create(fd, 4096); + obj[0].flags = EXEC_OBJECT_WRITE; + results = gem_mmap__wc(fd, obj[0].handle, 0, 4096, PROT_READ); + + obj[1].handle = gem_create(fd, 64*1024); + map = gem_mmap__wc(fd, obj[1].handle, 0, 64*1024, PROT_WRITE); + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + map[0] = MI_BATCH_BUFFER_END; + gem_execbuf(fd, &execbuf); + + memset(&reloc,0, sizeof(reloc)); + obj[1].relocation_count = 1; + obj[1].relocs_ptr = (uintptr_t)&reloc; + + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + + reloc.read_domains = 
I915_GEM_DOMAIN_INSTRUCTION; + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; + reloc.presumed_offset = obj[0].offset; + + for (j = 0; j < 1024; j++) { + execbuf.batch_start_offset = 64 * j; + reloc.offset = + execbuf.batch_start_offset + sizeof(uint32_t); + reloc.delta = sizeof(uint32_t) * j; + + offset = reloc.presumed_offset; + offset += reloc.delta; + + i = 16 * j; + /* MI_STORE_REG_MEM */ + map[i++] = 0x24 << 23 | 1; + if (has_64bit_reloc) + map[i-1]++; + map[i++] = RCS_TIMESTAMP; /* ring local! */ + map[i++] = offset; + if (has_64bit_reloc) + map[i++] = offset >> 32; + map[i++] = MI_BATCH_BUFFER_END; + } + + start = *reg; + for (j = 0; j < 1024; j++) { + execbuf.batch_start_offset = 64 * j; + reloc.offset = + execbuf.batch_start_offset + sizeof(uint32_t); + reloc.delta = sizeof(uint32_t) * j; + + gem_execbuf(fd, &execbuf); + } + end = *reg; + igt_assert(reloc.presumed_offset == obj[0].offset); + + gem_set_domain(fd, obj[0].handle, I915_GEM_DOMAIN_GTT, 0); + gpu_latency = (results[1023] - results[0]) / 1023.; + + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); + + execbuf.batch_start_offset = 0; + for (j = 0; j < 1023; j++) { + offset = obj[1].offset; + offset += 64 * (j + 1); + + i = 16 * j + (has_64bit_reloc ? 
4 : 3); + map[i] = MI_BATCH_BUFFER_START; + if (gen >= 8) { + map[i] |= 1 << 8 | 1; + map[i + 1] = offset; + map[i + 2] = offset >> 32; + } else if (gen >= 6) { + map[i] |= 1 << 8; + map[i + 1] = offset; + } else { + map[i] |= 2 << 6; + map[i + 1] = offset; + if (gen < 4) + map[i] |= 1; + } + } + offset = obj[1].offset; + gem_execbuf(fd, &execbuf); + igt_assert(offset == obj[1].offset); + + gem_set_domain(fd, obj[0].handle, I915_GEM_DOMAIN_GTT, 0); + igt_info("%s: dispatch latency: %.2f, execution latency: %.2f (target %.2f)\n", + name, + (end - start) / 1024., + gpu_latency, (results[1023] - results[0]) / 1023.); + + munmap(map, 64*1024); + munmap(results, 4096); + gem_close(fd, obj[0].handle); + gem_close(fd, obj[1].handle); +} + +static void latency_from_ring(int fd, unsigned ring, const char *name) +{ + const struct intel_execution_engine *e; + const int gen = intel_gen(intel_get_drm_devid(fd)); + const int has_64bit_reloc = gen >= 8; + struct drm_i915_gem_exec_object2 obj[2]; + struct drm_i915_gem_relocation_entry reloc; + struct drm_i915_gem_execbuffer2 execbuf; + uint32_t *map, *results; + int i, j; + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffers_ptr = (uintptr_t)obj; + execbuf.buffer_count = 2; + execbuf.flags = ring; + execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC | LOCAL_I915_EXEC_HANDLE_LUT; + + memset(obj, 0, sizeof(obj)); + obj[0].handle = gem_create(fd, 4096); + obj[0].flags = EXEC_OBJECT_WRITE; + results = gem_mmap__wc(fd, obj[0].handle, 0, 4096, PROT_READ); + + obj[1].handle = gem_create(fd, 64*1024); + map = gem_mmap__wc(fd, obj[1].handle, 0, 64*1024, PROT_WRITE); + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + map[0] = MI_BATCH_BUFFER_END; + gem_execbuf(fd, &execbuf); + + memset(&reloc,0, sizeof(reloc)); + obj[1].relocation_count = 1; + obj[1].relocs_ptr = (uintptr_t)&reloc; + + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + + reloc.read_domains = 
I915_GEM_DOMAIN_INSTRUCTION; + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; + reloc.presumed_offset = obj[0].offset; + + for (e = intel_execution_engines; e->name; e++) { + if (e->exec_id == 0) + continue; + + if (!gem_has_ring(fd, e->exec_id | e->flags)) + continue; + + gem_set_domain(fd, obj[1].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + + for (j = 0; j < 512; j++) { + uint64_t offset; + + execbuf.flags &= ~ENGINE_FLAGS; + execbuf.flags |= ring; + + execbuf.batch_start_offset = 64 * j; + reloc.offset = + execbuf.batch_start_offset + sizeof(uint32_t); + reloc.delta = sizeof(uint32_t) * j; + + offset = reloc.presumed_offset; + offset += reloc.delta; + + i = 16 * j; + /* MI_STORE_REG_MEM */ + map[i++] = 0x24 << 23 | 1; + if (has_64bit_reloc) + map[i-1]++; + map[i++] = RCS_TIMESTAMP; /* ring local! */ + map[i++] = offset; + if (has_64bit_reloc) + map[i++] = offset >> 32; + map[i++] = MI_BATCH_BUFFER_END; + + gem_execbuf(fd, &execbuf); + + execbuf.flags &= ~ENGINE_FLAGS; + execbuf.flags |= ring; + + execbuf.batch_start_offset = 64 * (j + 512); + reloc.offset = + execbuf.batch_start_offset + sizeof(uint32_t); + reloc.delta = sizeof(uint32_t) * (j + 512); + + offset = reloc.presumed_offset; + offset += reloc.delta; + + i = 16 * (j + 512); + /* MI_STORE_REG_MEM */ + map[i++] = 0x24 << 23 | 1; + if (has_64bit_reloc) + map[i-1]++; + map[i++] = RCS_TIMESTAMP; /* ring local! 
*/ + map[i++] = offset; + if (has_64bit_reloc) + map[i++] = offset >> 32; + map[i++] = MI_BATCH_BUFFER_END; + + gem_execbuf(fd, &execbuf); + } + + gem_set_domain(fd, obj[0].handle, + I915_GEM_DOMAIN_GTT, + I915_GEM_DOMAIN_GTT); + + igt_info("%s-%s delay: %.2f\n", + name, e->name, (results[1023] - results[0]) / 1024.); + } + + munmap(map, 64*1024); + munmap(results, 4096); + gem_close(fd, obj[0].handle); + gem_close(fd, obj[1].handle); +} + +static void print_welcome(int fd) +{ + bool active; + int dir; + + dir = igt_sysfs_open_parameters(fd); + if (dir < 0) + return; + + active = igt_sysfs_get_boolean(dir, "enable_guc_submission"); + if (active) { + igt_info("Using GuC submission\n"); + goto out; + } + + active = igt_sysfs_get_boolean(dir, "enable_execlists"); + if (active) { + igt_info("Using Execlists submission\n"); + goto out; + } + + active = igt_sysfs_get_boolean(dir, "semaphores"); + igt_info("Using Legacy submission%s\n", + active ? ", with semaphores" : ""); + +out: + close(dir); +} + +igt_main +{ + const struct intel_execution_engine *e; + int device = -1; + + igt_fixture { + intel_register_access_init(intel_get_pci_device(), false); + device = drm_open_driver(DRIVER_INTEL); + print_welcome(device); + } + + igt_subtest_group { + igt_fixture + igt_require(intel_gen(intel_get_drm_devid(device)) >= 7); + + for (e = intel_execution_engines; e->name; e++) { + if (e->exec_id == 0) + continue; + + igt_subtest_f("%s-dispatch", e->name) { + gem_require_ring(device, e->exec_id | e->flags); + latency_on_ring(device, + e->exec_id | e->flags, + e->name); + } + igt_subtest_f("%s-synchronisation", e->name) { + gem_require_ring(device, e->exec_id | e->flags); + latency_from_ring(device, + e->exec_id | e->flags, + e->name); + } + } + } + + + igt_fixture { + close(device); + } +} |