summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAshutosh Dixit <ashutosh.dixit@intel.com>2024-07-05 14:43:10 -0700
committerAshutosh Dixit <ashutosh.dixit@intel.com>2024-07-05 18:19:13 -0700
commitbfd19ed9b764339b32d122e70916c7558b3814cf (patch)
tree5e0377751c4d684300ac5e4086ae50e2e23697d5
parent8be32d709f1b430739afac9963606ebb9ad914b5 (diff)
tests/intel/xe_oa: OAR/OAC tests
"mi-rpc", "oa-tlb-invalidate" and "unprivileged-single-ctx-counters" tests. v2: Run "oa-tlb-invalidate" only for specific product versions (Umesh) v3: Rename xe perf layer as xe observation layer Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
-rw-r--r--tests/intel/xe_oa.c701
1 files changed, 701 insertions, 0 deletions
diff --git a/tests/intel/xe_oa.c b/tests/intel/xe_oa.c
index 6dcf447e6..9b6bde30b 100644
--- a/tests/intel/xe_oa.c
+++ b/tests/intel/xe_oa.c
@@ -596,6 +596,18 @@ read_report_reason(const uint32_t *report)
return "unknown";
}
+static uint32_t
+cs_timestamp_frequency(int fd)
+{
+ return xe_gt_list(drm_fd)->gt_list[0].reference_clock;
+}
+
+static uint64_t
+cs_timebase_scale(uint32_t u32_delta)
+{
+ return ((uint64_t)u32_delta * NSEC_PER_SEC) / cs_timestamp_frequency(drm_fd);
+}
+
static uint64_t
oa_timestamp(const uint32_t *report, enum intel_xe_oa_format_name format)
{
@@ -662,6 +674,15 @@ oa_report_get_ctx_id(uint32_t *report)
return report[2];
}
+static int
+oar_unit_default_format(void)
+{
+ if (IS_DG2(devid) || IS_METEORLAKE(devid))
+ return XE_OAR_FORMAT_A32u40_A4u32_B8_C8;
+
+ return default_test_set->perf_oa_format;
+}
+
static void *buf_map(int fd, struct intel_buf *buf, bool write)
{
void *p;
@@ -700,6 +721,21 @@ scratch_buf_init(struct buf_ops *bops,
scratch_buf_memset(buf, width, height, color);
}
+static void
+emit_report_perf_count(struct intel_bb *ibb,
+ struct intel_buf *dst,
+ int dst_offset,
+ uint32_t report_id)
+{
+ intel_bb_add_intel_buf(ibb, dst, true);
+
+ intel_bb_out(ibb, OA_MI_REPORT_PERF_COUNT);
+ intel_bb_emit_reloc(ibb, dst->handle,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ dst_offset, dst->addr.offset);
+ intel_bb_out(ibb, report_id);
+}
+
static bool
oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
{
@@ -741,6 +777,31 @@ get_40bit_a_delta(uint64_t value0, uint64_t value1)
}
static void
+accumulate_uint32(size_t offset,
+ uint32_t *report0,
+ uint32_t *report1,
+ uint64_t *delta)
+{
+ uint32_t value0 = *(uint32_t *)(((uint8_t *)report0) + offset);
+ uint32_t value1 = *(uint32_t *)(((uint8_t *)report1) + offset);
+
+ *delta += (uint32_t)(value1 - value0);
+}
+
+static void
+accumulate_uint40(int a_index,
+ uint32_t *report0,
+ uint32_t *report1,
+ enum intel_xe_oa_format_name format,
+ uint64_t *delta)
+{
+ uint64_t value0 = read_40bit_a_counter(report0, format, a_index),
+ value1 = read_40bit_a_counter(report1, format, a_index);
+
+ *delta += get_40bit_a_delta(value0, value1);
+}
+
+static void
accumulate_uint64(int a_index,
const uint32_t *report0,
const uint32_t *report1,
@@ -753,6 +814,78 @@ accumulate_uint64(int a_index,
*delta += (value1 - value0);
}
+static void
+accumulate_reports(struct accumulator *accumulator,
+ uint32_t *start,
+ uint32_t *end)
+{
+ struct oa_format format = get_oa_format(accumulator->format);
+ uint64_t *deltas = accumulator->deltas;
+ int idx = 0;
+
+ /* timestamp */
+ deltas[idx] += oa_timestamp_delta(end, start, accumulator->format);
+ idx++;
+
+ /* clock cycles */
+ deltas[idx] += oa_tick_delta(end, start, accumulator->format);
+ idx++;
+
+ for (int i = 0; i < format.n_a40; i++) {
+ accumulate_uint40(i, start, end, accumulator->format,
+ deltas + idx++);
+ }
+
+ for (int i = 0; i < format.n_a64; i++) {
+ accumulate_uint64(i, start, end, accumulator->format,
+ deltas + idx++);
+ }
+
+ for (int i = 0; i < format.n_a; i++) {
+ accumulate_uint32(format.a_off + 4 * i,
+ start, end, deltas + idx++);
+ }
+
+ for (int i = 0; i < format.n_b; i++) {
+ accumulate_uint32(format.b_off + 4 * i,
+ start, end, deltas + idx++);
+ }
+
+ for (int i = 0; i < format.n_c; i++) {
+ accumulate_uint32(format.c_off + 4 * i,
+ start, end, deltas + idx++);
+ }
+}
+
+static void
+accumulator_print(struct accumulator *accumulator, const char *title)
+{
+ struct oa_format format = get_oa_format(accumulator->format);
+ uint64_t *deltas = accumulator->deltas;
+ int idx = 0;
+
+ igt_debug("%s:\n", title);
+ igt_debug("\ttime delta = %"PRIu64"\n", deltas[idx++]);
+ igt_debug("\tclock cycle delta = %"PRIu64"\n", deltas[idx++]);
+
+ for (int i = 0; i < format.n_a40; i++)
+ igt_debug("\tA%u = %"PRIu64"\n", i, deltas[idx++]);
+
+ for (int i = 0; i < format.n_a64; i++)
+ igt_debug("\tA64_%u = %"PRIu64"\n", i, deltas[idx++]);
+
+ for (int i = 0; i < format.n_a; i++) {
+ int a_id = format.first_a + i;
+ igt_debug("\tA%u = %"PRIu64"\n", a_id, deltas[idx++]);
+ }
+
+ for (int i = 0; i < format.n_a; i++)
+ igt_debug("\tB%u = %"PRIu64"\n", i, deltas[idx++]);
+
+ for (int i = 0; i < format.n_c; i++)
+ igt_debug("\tC%u = %"PRIu64"\n", i, deltas[idx++]);
+}
+
/* The TestOa metric set is designed so */
static void
sanity_check_reports(const uint32_t *oa_report0, const uint32_t *oa_report1,
@@ -2074,6 +2207,92 @@ static void test_polling_small_buf(void)
0.20 * n_expect_read_bytes);
}
+static int
+num_valid_reports_captured(struct intel_xe_oa_open_prop *param,
+ int64_t *duration_ns, int fmt)
+{
+ uint8_t buf[1024 * 1024];
+ int64_t start, end;
+ int num_reports = 0;
+ size_t format_size = get_oa_format(fmt).size;
+
+ igt_debug("Expected duration = %"PRId64"\n", *duration_ns);
+
+ stream_fd = __perf_open(drm_fd, param, true);
+
+ start = get_time();
+ do_ioctl(stream_fd, DRM_XE_OBSERVATION_IOCTL_ENABLE, 0);
+ for (/* nop */; ((end = get_time()) - start) < *duration_ns; /* nop */) {
+ int ret;
+
+ while ((ret = read(stream_fd, buf, sizeof(buf))) < 0 &&
+ (errno == EINTR || errno == EIO))
+ ;
+
+ igt_assert(ret > 0);
+
+ for (int offset = 0; offset < ret; offset += format_size) {
+ uint32_t *report = (void *)(buf + offset);
+
+ if (report_reason(report) & OAREPORT_REASON_TIMER)
+ num_reports++;
+ }
+ }
+ __perf_close(stream_fd);
+
+ *duration_ns = end - start;
+
+ igt_debug("Actual duration = %"PRIu64"\n", *duration_ns);
+
+ return num_reports;
+}
+
+/**
+ * SUBTEST: oa-tlb-invalidate
+ * Description: Open OA stream twice to verify OA TLB invalidation
+ */
+static void
+test_oa_tlb_invalidate(const struct drm_xe_engine_class_instance *hwe)
+{
+ int oa_exponent = max_oa_exponent_for_period_lte(30000000);
+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
+ uint64_t properties[] = {
+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
+ DRM_XE_OA_PROPERTY_SAMPLE_OA, true,
+
+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(test_set->perf_oa_format),
+ DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT, oa_exponent,
+ DRM_XE_OA_PROPERTY_OA_DISABLED, true,
+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
+ };
+ struct intel_xe_oa_open_prop param = {
+ .num_properties = ARRAY_SIZE(properties) / 2,
+ .properties_ptr = to_user_pointer(properties),
+ };
+ int num_reports1, num_reports2, num_expected_reports;
+ int64_t duration;
+
+ /* Capture reports for 5 seconds twice and then make sure you get around
+ * the same number of reports. In the case of failure, the number of
+ * reports will vary largely since the beginning of the OA buffer
+ * will have invalid entries.
+ */
+ duration = 5LL * NSEC_PER_SEC;
+ num_reports1 = num_valid_reports_captured(&param, &duration, test_set->perf_oa_format);
+ num_expected_reports = duration / oa_exponent_to_ns(oa_exponent);
+ igt_debug("expected num reports = %d\n", num_expected_reports);
+ igt_debug("actual num reports = %d\n", num_reports1);
+ igt_assert(num_reports1 > 0.95 * num_expected_reports);
+
+ duration = 5LL * NSEC_PER_SEC;
+ num_reports2 = num_valid_reports_captured(&param, &duration, test_set->perf_oa_format);
+ num_expected_reports = duration / oa_exponent_to_ns(oa_exponent);
+ igt_debug("expected num reports = %d\n", num_expected_reports);
+ igt_debug("actual num reports = %d\n", num_reports2);
+ igt_assert(num_reports2 > 0.95 * num_expected_reports);
+}
+
/**
* SUBTEST: buffer-fill
* Description: Test filling, wraparound and overflow of OA buffer
@@ -2659,6 +2878,468 @@ test_disabled_read_error(void)
__perf_close(stream_fd);
}
+/**
+ * SUBTEST: mi-rpc
+ * Description: Test OAR/OAC using MI_REPORT_PERF_COUNT
+ */
+static void
+test_mi_rpc(struct drm_xe_engine_class_instance *hwe)
+
+{
+ uint64_t fmt = ((IS_DG2(devid) || IS_METEORLAKE(devid)) &&
+ hwe->engine_class == DRM_XE_ENGINE_CLASS_COMPUTE) ?
+ XE_OAC_FORMAT_A24u64_B8_C8 : oar_unit_default_format();
+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
+ uint64_t properties[] = {
+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
+
+ /* On Gen12, MI RPC uses OAR. OAR is configured only for the
+ * render context that wants to measure the performance. Hence a
+ * context must be specified in the gen12 MI RPC when compared
+ * to previous gens.
+ *
+ * Have a random value here for the context id, but initialize
+ * it once you figure out the context ID for the work to be
+ * measured
+ */
+ DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, UINT64_MAX,
+
+ /* OA unit configuration:
+ * DRM_XE_OA_PROPERTY_SAMPLE_OA is no longer required for Gen12
+ * because the OAR unit increments counters only for the
+ * relevant context. No other parameters are needed since we do
+ * not rely on the OA buffer anymore to normalize the counter
+ * values.
+ */
+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(fmt),
+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
+ };
+ struct intel_xe_oa_open_prop param = {
+ .num_properties = ARRAY_SIZE(properties) / 2,
+ .properties_ptr = to_user_pointer(properties),
+ };
+ struct buf_ops *bops;
+ struct intel_bb *ibb;
+ struct intel_buf *buf;
+#define INVALID_CTX_ID 0xffffffff
+ uint32_t ctx_id = INVALID_CTX_ID;
+ uint32_t vm = 0;
+ uint32_t *report32;
+ size_t format_size_32;
+ struct oa_format format = get_oa_format(fmt);
+
+ /* Ensure observation_paranoid is set to 1 by default */
+ write_u64_file("/proc/sys/dev/xe/observation_paranoid", 1);
+
+ bops = buf_ops_create(drm_fd);
+ vm = xe_vm_create(drm_fd, 0, 0);
+ ctx_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
+ igt_assert_neq(ctx_id, INVALID_CTX_ID);
+ properties[3] = ctx_id;
+
+ ibb = intel_bb_create_with_context(drm_fd, ctx_id, vm, NULL, BATCH_SZ);
+ buf = intel_buf_create(bops, 4096, 1, 8, 64,
+ I915_TILING_NONE, I915_COMPRESSION_NONE);
+
+ buf_map(drm_fd, buf, true);
+ memset(buf->ptr, 0x80, 4096);
+ intel_buf_unmap(buf);
+
+ stream_fd = __perf_open(drm_fd, &param, false);
+ set_fd_flags(stream_fd, O_CLOEXEC);
+
+#define REPORT_ID 0xdeadbeef
+#define REPORT_OFFSET 0
+ emit_report_perf_count(ibb,
+ buf,
+ REPORT_OFFSET,
+ REPORT_ID);
+ intel_bb_flush_render(ibb);
+ intel_bb_sync(ibb);
+
+ buf_map(drm_fd, buf, false);
+ report32 = buf->ptr;
+ format_size_32 = format.size >> 2;
+ dump_report(report32, format_size_32, "mi-rpc");
+
+ /* Sanity check reports
+ * reportX_32[0]: report id passed with mi-rpc
+ * reportX_32[1]: timestamp. NOTE: wraps around in ~6 minutes.
+ *
+ * reportX_32[format.b_off]: check if the entire report was filled.
+ * B0 counter falls in the last 64 bytes of this report format.
+ * Since reports are filled in 64 byte blocks, we should be able to
+ * assure that the report was filled by checking the B0 counter. B0
+ * counter is defined to be zero, so we can easily validate it.
+ *
+ * reportX_32[format_size_32]: outside report, make sure only the report
+ * size amount of data was written.
+ */
+ igt_assert_eq(report32[0], REPORT_ID);
+ igt_assert(oa_timestamp(report32, test_set->perf_oa_format));
+ igt_assert_neq(report32[format.b_off >> 2], 0x80808080);
+ igt_assert_eq(report32[format_size_32], 0x80808080);
+
+ intel_buf_unmap(buf);
+ intel_buf_destroy(buf);
+ intel_bb_destroy(ibb);
+ xe_exec_queue_destroy(drm_fd, ctx_id);
+ xe_vm_destroy(drm_fd, vm);
+ buf_ops_destroy(bops);
+ __perf_close(stream_fd);
+}
+
+static void
+emit_stall_timestamp_and_rpc(struct intel_bb *ibb,
+ struct intel_buf *dst,
+ int timestamp_offset,
+ int report_dst_offset,
+ uint32_t report_id)
+{
+ uint32_t pipe_ctl_flags = (PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_WRITE_TIMESTAMP);
+
+ intel_bb_add_intel_buf(ibb, dst, true);
+ intel_bb_out(ibb, GFX_OP_PIPE_CONTROL(6));
+ intel_bb_out(ibb, pipe_ctl_flags);
+ intel_bb_emit_reloc(ibb, dst->handle,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ timestamp_offset, dst->addr.offset);
+ intel_bb_out(ibb, 0); /* imm lower */
+ intel_bb_out(ibb, 0); /* imm upper */
+
+ emit_report_perf_count(ibb, dst, report_dst_offset, report_id);
+}
+
+static void single_ctx_helper(struct drm_xe_engine_class_instance *hwe)
+{
+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
+ uint64_t fmt = oar_unit_default_format();
+ uint64_t properties[] = {
+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
+
+ /* Have a random value here for the context id, but initialize
+ * it once you figure out the context ID for the work to be
+ * measured
+ */
+ DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, UINT64_MAX,
+
+ /* OA unit configuration:
+ * DRM_XE_OA_PROPERTY_SAMPLE_OA is no longer required for Gen12
+ * because the OAR unit increments counters only for the
+ * relevant context. No other parameters are needed since we do
+ * not rely on the OA buffer anymore to normalize the counter
+ * values.
+ */
+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(fmt),
+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
+ };
+ struct intel_xe_oa_open_prop param = {
+ .num_properties = ARRAY_SIZE(properties) / 2,
+ .properties_ptr = to_user_pointer(properties),
+ };
+ struct buf_ops *bops;
+ struct intel_bb *ibb0, *ibb1;
+ struct intel_buf src[3], dst[3], *dst_buf;
+ uint32_t context0_id, context1_id, vm = 0;
+ uint32_t *report0_32, *report1_32, *report2_32, *report3_32;
+ uint64_t timestamp0_64, timestamp1_64;
+ uint64_t delta_ts64, delta_oa32;
+ uint64_t delta_ts64_ns, delta_oa32_ns;
+ uint64_t delta_delta;
+ int width = 800;
+ int height = 600;
+#define INVALID_CTX_ID 0xffffffff
+ uint32_t ctx0_id = INVALID_CTX_ID;
+ uint32_t ctx1_id = INVALID_CTX_ID;
+ int ret;
+ struct accumulator accumulator = {
+ .format = fmt
+ };
+
+ bops = buf_ops_create(drm_fd);
+
+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
+ scratch_buf_init(bops, &src[i], width, height, 0xff0000ff);
+ scratch_buf_init(bops, &dst[i], width, height, 0x00ff00ff);
+ }
+
+ vm = xe_vm_create(drm_fd, 0, 0);
+ context0_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
+ context1_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
+ ibb0 = intel_bb_create_with_context(drm_fd, context0_id, vm, NULL, BATCH_SZ);
+ ibb1 = intel_bb_create_with_context(drm_fd, context1_id, vm, NULL, BATCH_SZ);
+
+ igt_debug("submitting warm up render_copy\n");
+
+ /* Submit some early, unmeasured, work to the context we want */
+ render_copy(ibb0,
+ &src[0], 0, 0, width, height,
+ &dst[0], 0, 0);
+
+ /* Initialize the context parameter to the perf open ioctl here */
+ properties[3] = context0_id;
+
+ igt_debug("opening xe oa stream\n");
+ stream_fd = __perf_open(drm_fd, &param, false);
+ set_fd_flags(stream_fd, O_CLOEXEC);
+
+ dst_buf = intel_buf_create(bops, 4096, 1, 8, 64,
+ I915_TILING_NONE,
+ I915_COMPRESSION_NONE);
+
+ /* Set write domain to cpu briefly to fill the buffer with 80s */
+ buf_map(drm_fd, dst_buf, true /* write enable */);
+ memset(dst_buf->ptr, 0x80, 2048);
+ memset((uint8_t *) dst_buf->ptr + 2048, 0, 2048);
+ intel_buf_unmap(dst_buf);
+
+ /* Submit an mi-rpc to context0 before measurable work */
+#define BO_TIMESTAMP_OFFSET0 1024
+#define BO_REPORT_OFFSET0 0
+#define BO_REPORT_ID0 0xdeadbeef
+ emit_stall_timestamp_and_rpc(ibb0,
+ dst_buf,
+ BO_TIMESTAMP_OFFSET0,
+ BO_REPORT_OFFSET0,
+ BO_REPORT_ID0);
+ intel_bb_flush_render(ibb0);
+
+ /* Remove intel_buf from ibb0 added implicitly in rendercopy */
+ intel_bb_remove_intel_buf(ibb0, dst_buf);
+
+ /* This is the work/context that is measured for counter increments */
+ render_copy(ibb0,
+ &src[0], 0, 0, width, height,
+ &dst[0], 0, 0);
+ intel_bb_flush_render(ibb0);
+
+ /* Submit an mi-rpc to context1 before work
+ *
+ * On gen12, this measurement should just yield counters that are
+ * all zeroes, since the counters will only increment for the
+ * context passed to perf open ioctl
+ */
+#define BO_TIMESTAMP_OFFSET2 1040
+#define BO_REPORT_OFFSET2 512
+#define BO_REPORT_ID2 0x00c0ffee
+ emit_stall_timestamp_and_rpc(ibb1,
+ dst_buf,
+ BO_TIMESTAMP_OFFSET2,
+ BO_REPORT_OFFSET2,
+ BO_REPORT_ID2);
+ intel_bb_flush_render(ibb1);
+
+ /* Submit two copies on the other context to avoid a false
+ * positive in case the driver somehow ended up filtering for
+ * context1
+ */
+ render_copy(ibb1,
+ &src[1], 0, 0, width, height,
+ &dst[1], 0, 0);
+
+ render_copy(ibb1,
+ &src[2], 0, 0, width, height,
+ &dst[2], 0, 0);
+ intel_bb_flush_render(ibb1);
+
+ /* Submit an mi-rpc to context1 after all work */
+#define BO_TIMESTAMP_OFFSET3 1048
+#define BO_REPORT_OFFSET3 768
+#define BO_REPORT_ID3 0x01c0ffee
+ emit_stall_timestamp_and_rpc(ibb1,
+ dst_buf,
+ BO_TIMESTAMP_OFFSET3,
+ BO_REPORT_OFFSET3,
+ BO_REPORT_ID3);
+ intel_bb_flush_render(ibb1);
+
+ /* Remove intel_buf from ibb1 added implicitly in rendercopy */
+ intel_bb_remove_intel_buf(ibb1, dst_buf);
+
+ /* Submit an mi-rpc to context0 after all measurable work */
+#define BO_TIMESTAMP_OFFSET1 1032
+#define BO_REPORT_OFFSET1 256
+#define BO_REPORT_ID1 0xbeefbeef
+ emit_stall_timestamp_and_rpc(ibb0,
+ dst_buf,
+ BO_TIMESTAMP_OFFSET1,
+ BO_REPORT_OFFSET1,
+ BO_REPORT_ID1);
+ intel_bb_flush_render(ibb0);
+ intel_bb_sync(ibb0);
+ intel_bb_sync(ibb1);
+
+ buf_map(drm_fd, dst_buf, false);
+
+ /* Sanity check reports
+ * reportX_32[0]: report id passed with mi-rpc
+ * reportX_32[1]: timestamp
+ * reportX_32[2]: context id
+ *
+ * report0_32: start of measurable work
+ * report1_32: end of measurable work
+ * report2_32: start of other work
+ * report3_32: end of other work
+ */
+ report0_32 = dst_buf->ptr;
+ igt_assert_eq(report0_32[0], 0xdeadbeef);
+ igt_assert(oa_timestamp(report0_32, fmt));
+ ctx0_id = report0_32[2];
+ igt_debug("MI_RPC(start) CTX ID: %u\n", ctx0_id);
+ dump_report(report0_32, 64, "report0_32");
+
+ report1_32 = report0_32 + 64;
+ igt_assert_eq(report1_32[0], 0xbeefbeef);
+ igt_assert(oa_timestamp(report1_32, fmt));
+ ctx1_id = report1_32[2];
+ igt_debug("CTX ID1: %u\n", ctx1_id);
+ dump_report(report1_32, 64, "report1_32");
+
+ /* Verify that counters in context1 are all zeroes */
+ report2_32 = report0_32 + 128;
+ igt_assert_eq(report2_32[0], 0x00c0ffee);
+ igt_assert(oa_timestamp(report2_32, fmt));
+ dump_report(report2_32, 64, "report2_32");
+ igt_assert_eq(0, memcmp(&report2_32[4],
+ (uint8_t *) dst_buf->ptr + 2048,
+ 240));
+
+ report3_32 = report0_32 + 192;
+ igt_assert_eq(report3_32[0], 0x01c0ffee);
+ igt_assert(oa_timestamp(report3_32, fmt));
+ dump_report(report3_32, 64, "report3_32");
+ igt_assert_eq(0, memcmp(&report3_32[4],
+ (uint8_t *) dst_buf->ptr + 2048,
+ 240));
+
+ /* Accumulate deltas for counters - A0, A21 and A26 */
+ memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
+ accumulate_reports(&accumulator, report0_32, report1_32);
+ igt_debug("total: A0 = %"PRIu64", A21 = %"PRIu64", A26 = %"PRIu64"\n",
+ accumulator.deltas[2 + 0],
+ accumulator.deltas[2 + 21],
+ accumulator.deltas[2 + 26]);
+
+ igt_debug("oa_timestamp32 0 = %"PRIu64"\n", oa_timestamp(report0_32, fmt));
+ igt_debug("oa_timestamp32 1 = %"PRIu64"\n", oa_timestamp(report1_32, fmt));
+ igt_debug("ctx_id 0 = %u\n", report0_32[2]);
+ igt_debug("ctx_id 1 = %u\n", report1_32[2]);
+
+ /* The delta as calculated via the PIPE_CONTROL timestamp or
+ * the OA report timestamps should be almost identical but
+ * allow a 500 nanoseconds margin.
+ */
+ timestamp0_64 = *(uint64_t *)(((uint8_t *)dst_buf->ptr) + BO_TIMESTAMP_OFFSET0);
+ timestamp1_64 = *(uint64_t *)(((uint8_t *)dst_buf->ptr) + BO_TIMESTAMP_OFFSET1);
+
+ igt_debug("ts_timestamp64 0 = %"PRIu64"\n", timestamp0_64);
+ igt_debug("ts_timestamp64 1 = %"PRIu64"\n", timestamp1_64);
+
+ delta_ts64 = timestamp1_64 - timestamp0_64;
+ delta_oa32 = oa_timestamp_delta(report1_32, report0_32, fmt);
+
+ /* Sanity check that we can pass the delta to timebase_scale */
+ delta_oa32_ns = timebase_scale(delta_oa32);
+ delta_ts64_ns = cs_timebase_scale(delta_ts64);
+
+ igt_debug("oa32 delta = %"PRIu64", = %"PRIu64"ns\n",
+ delta_oa32, delta_oa32_ns);
+ igt_debug("ts64 delta = %"PRIu64", = %"PRIu64"ns\n",
+ delta_ts64, delta_ts64_ns);
+
+ delta_delta = delta_ts64_ns > delta_oa32_ns ?
+ (delta_ts64_ns - delta_oa32_ns) :
+ (delta_oa32_ns - delta_ts64_ns);
+ if (delta_delta > 500) {
+ igt_debug("delta_delta = %"PRIu64". exceeds margin, skipping..\n",
+ delta_delta);
+ exit(EAGAIN);
+ }
+
+ igt_debug("n samples written = %"PRIu64"/%"PRIu64" (%ix%i)\n",
+ accumulator.deltas[2 + 21],
+ accumulator.deltas[2 + 26],
+ width, height);
+ accumulator_print(&accumulator, "filtered");
+
+ /* Verify that the work actually happened by comparing the src
+ * and dst buffers
+ */
+ buf_map(drm_fd, &src[0], false);
+ buf_map(drm_fd, &dst[0], false);
+
+ ret = memcmp(src[0].ptr, dst[0].ptr, 4 * width * height);
+ intel_buf_unmap(&src[0]);
+ intel_buf_unmap(&dst[0]);
+
+ if (ret != 0) {
+ accumulator_print(&accumulator, "total");
+ exit(EAGAIN);
+ }
+
+ /* FIXME: can we deduce the presence of A26 from get_oa_format(fmt)? */
+ if (intel_graphics_ver(devid) >= IP_VER(20, 0))
+ goto skip_check;
+
+ /* Check that this test passed. The test measures the number of 2x2
+ * samples written to the render target using the counter A26. For
+ * OAR, this counter will only have increments relevant to this specific
+ * context. The value equals the width * height of the rendered work.
+ */
+ igt_assert_eq(accumulator.deltas[2 + 26], width * height);
+
+ skip_check:
+ /* Clean up */
+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
+ intel_buf_close(bops, &src[i]);
+ intel_buf_close(bops, &dst[i]);
+ }
+
+ intel_buf_unmap(dst_buf);
+ intel_buf_destroy(dst_buf);
+ intel_bb_destroy(ibb0);
+ intel_bb_destroy(ibb1);
+ xe_exec_queue_destroy(drm_fd, context0_id);
+ xe_exec_queue_destroy(drm_fd, context1_id);
+ xe_vm_destroy(drm_fd, vm);
+ buf_ops_destroy(bops);
+ __perf_close(stream_fd);
+}
+
+/**
+ * SUBTEST: unprivileged-single-ctx-counters
+ * Description: A harder test for OAR/OAC using MI_REPORT_PERF_COUNT
+ */
+static void
+test_single_ctx_render_target_writes_a_counter(struct drm_xe_engine_class_instance *hwe)
+{
+ int child_ret;
+ struct igt_helper_process child = {};
+
+ /* Ensure observation_paranoid is set to 1 by default */
+ write_u64_file("/proc/sys/dev/xe/observation_paranoid", 1);
+
+ do {
+ igt_fork_helper(&child) {
+ /* A local device for local resources. */
+ drm_fd = drm_reopen_driver(drm_fd);
+
+ igt_drop_root();
+
+ single_ctx_helper(hwe);
+
+ drm_close_driver(drm_fd);
+ }
+ child_ret = igt_wait_helper(&child);
+ igt_assert(WEXITSTATUS(child_ret) == EAGAIN ||
+ WEXITSTATUS(child_ret) == 0);
+ } while (WEXITSTATUS(child_ret) == EAGAIN);
+}
+
static unsigned read_xe_module_ref(void)
{
FILE *fp = fopen("/proc/modules", "r");
@@ -2929,6 +3610,26 @@ igt_main
igt_subtest("short-reads")
test_short_reads();
+ igt_subtest_group {
+ igt_subtest_with_dynamic("mi-rpc")
+ __for_one_hwe_in_oag(hwe)
+ test_mi_rpc(hwe);
+
+ igt_subtest_with_dynamic("oa-tlb-invalidate") {
+ igt_require(intel_graphics_ver(devid) <= IP_VER(12, 70) &&
+ intel_graphics_ver(devid) != IP_VER(12, 60));
+ __for_one_hwe_in_oag(hwe)
+ test_oa_tlb_invalidate(hwe);
+ }
+
+ igt_subtest_with_dynamic("unprivileged-single-ctx-counters") {
+ igt_require_f(render_copy, "no render-copy function\n");
+ igt_require(intel_graphics_ver(devid) < IP_VER(20, 0));
+ __for_one_render_engine(hwe)
+ test_single_ctx_render_target_writes_a_counter(hwe);
+ }
+ }
+
igt_fixture {
/* leave sysctl options in their default state... */
write_u64_file("/proc/sys/dev/xe/observation_paranoid", 1);