summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Indrajit Kumar Das <indrajit-kumar.das@amd.com> 2020-11-10 15:29:45 +0530
committer: Marge Bot <eric+marge@anholt.net> 2020-12-05 16:11:28 +0000
commit: 6df572532dce8fa5c09c4774e0f79e039b118a85 (patch)
tree: 1b6ba669a18df543c379a7c05c7b6fc9729fd5ac
parent: 3bd9db5be3c0e28443098dd0108e01a73c2b83e2 (diff)
radeonsi/gfx10: added support for gfx10 conditional rendering
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7526>
-rw-r--r-- src/gallium/drivers/radeonsi/gfx10_query.c 47
-rw-r--r-- src/gallium/drivers/radeonsi/si_query.c 161
-rw-r--r-- src/gallium/drivers/radeonsi/si_query.h 47
3 files changed, 151 insertions, 104 deletions
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c
index f9aa027d16f..f31d4ed03a7 100644
--- a/src/gallium/drivers/radeonsi/gfx10_query.c
+++ b/src/gallium/drivers/radeonsi/gfx10_query.c
@@ -30,53 +30,6 @@
#include <stddef.h>
-/**
- * The query buffer is written to by ESGS NGG shaders with statistics about
- * generated and (streamout-)emitted primitives.
- *
- * The context maintains a ring of these query buffers, and queries simply
- * point into the ring, allowing an arbitrary number of queries to be active
- * without additional GPU cost.
- */
-struct gfx10_sh_query_buffer {
- struct list_head list;
- struct si_resource *buf;
- unsigned refcount;
-
- /* Offset into the buffer in bytes; points at the first un-emitted entry. */
- unsigned head;
-};
-
-/* Memory layout of the query buffer. Must be kept in sync with shaders
- * (including QBO shaders) and should be aligned to cachelines.
- *
- * The somewhat awkward memory layout is for compatibility with the
- * SET_PREDICATION packet, which also means that we're setting the high bit
- * of all those values unconditionally.
- */
-struct gfx10_sh_query_buffer_mem {
- struct {
- uint64_t generated_primitives_start_dummy;
- uint64_t emitted_primitives_start_dummy;
- uint64_t generated_primitives;
- uint64_t emitted_primitives;
- } stream[4];
- uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
- uint32_t pad[31];
-};
-
-/* Shader-based queries. */
-struct gfx10_sh_query {
- struct si_query b;
-
- struct gfx10_sh_query_buffer *first;
- struct gfx10_sh_query_buffer *last;
- unsigned first_begin;
- unsigned last_end;
-
- unsigned stream;
-};
-
static void emit_shader_query(struct si_context *sctx)
{
assert(!list_is_empty(&sctx->shader_query_buffers));
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 17d9228e493..d3450310b18 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -1008,85 +1008,132 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf,
static void si_emit_query_predication(struct si_context *ctx)
{
- struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
- struct si_query_buffer *qbuf;
uint32_t op;
bool flag_wait, invert;
+ struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
if (!query)
return;
- if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
- query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
- assert(!"not implemented");
- }
-
invert = ctx->render_cond_invert;
flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
- if (query->workaround_buf) {
- op = PRED_OP(PREDICATION_OP_BOOL64);
- } else {
- switch (query->b.type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- op = PRED_OP(PREDICATION_OP_ZPASS);
- break;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
- invert = !invert;
- break;
- default:
- assert(0);
- return;
- }
- }
+ if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+ struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
+ struct gfx10_sh_query_buffer *qbuf, *first, *last;
- /* if true then invert, see GL_ARB_conditional_render_inverted */
- if (invert)
- op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
- else
- op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
- /* Use the value written by compute shader as a workaround. Note that
- * the wait flag does not apply in this predication mode.
- *
- * The shader outputs the result value to L2. Workarounds only affect GFX8
- * and later, where the CP reads data from L2, so we don't need an
- * additional flush.
- */
- if (query->workaround_buf) {
- uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
- emit_set_predicate(ctx, query->workaround_buf, va, op);
- return;
- }
+ /* if true then invert, see GL_ARB_conditional_render_inverted */
+ if (!invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+ else
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
- op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+ op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
- /* emit predicate packets for all data blocks */
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- unsigned results_base = 0;
- uint64_t va_base = qbuf->buf->gpu_address;
+ first = gfx10_query->first;
+ last = gfx10_query->last;
- while (results_base < qbuf->results_end) {
+ while (first) {
+ qbuf = first;
+ if (first != last)
+ first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ else
+ first = NULL;
+
+ unsigned results_base = gfx10_query->first_begin;
+ uint64_t va_base = qbuf->buf->gpu_address;
uint64_t va = va_base + results_base;
- if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
- emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+ unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
+ unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
+
+ unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+ do {
+ if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
- /* set CONTINUE bit for all packets except the first */
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
+ }
+ } else {
+ emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
op |= PREDICATION_CONTINUE;
}
- } else {
- emit_set_predicate(ctx, qbuf->buf, va, op);
- op |= PREDICATION_CONTINUE;
+
+ results_base += sizeof(struct gfx10_sh_query_buffer_mem);
+ } while (count--);
+ }
+ } else {
+ struct si_query_buffer *qbuf;
+
+ if (query->workaround_buf) {
+ op = PRED_OP(PREDICATION_OP_BOOL64);
+ } else {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ op = PRED_OP(PREDICATION_OP_ZPASS);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+ invert = !invert;
+ break;
+ default:
+ assert(0);
+ return;
}
+ }
- results_base += query->result_size;
+ /* if true then invert, see GL_ARB_conditional_render_inverted */
+ if (invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+ else
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+ /* Use the value written by compute shader as a workaround. Note that
+ * the wait flag does not apply in this predication mode.
+ *
+ * The shader outputs the result value to L2. Workarounds only affect GFX8
+ * and later, where the CP reads data from L2, so we don't need an
+ * additional flush.
+ */
+ if (query->workaround_buf) {
+ uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+ emit_set_predicate(ctx, query->workaround_buf, va, op);
+ return;
+ }
+
+ op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+ /* emit predicate packets for all data blocks */
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ uint64_t va_base = qbuf->buf->gpu_address;
+
+ while (results_base < qbuf->results_end) {
+ uint64_t va = va_base + results_base;
+
+ if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
+ }
+ } else {
+ emit_set_predicate(ctx, qbuf->buf, va, op);
+ op |= PREDICATION_CONTINUE;
+ }
+
+ results_base += query->result_size;
+ }
}
}
}
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 1eaa3b255a6..e0be318bcac 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -225,6 +225,53 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
/* Shader-based queries */
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+ struct list_head list;
+ struct si_resource *buf;
+ unsigned refcount;
+
+ /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+ unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward memory layout is for compatibility with the
+ * SET_PREDICATION packet, which also means that we're setting the high bit
+ * of all those values unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+ struct {
+ uint64_t generated_primitives_start_dummy;
+ uint64_t emitted_primitives_start_dummy;
+ uint64_t generated_primitives;
+ uint64_t emitted_primitives;
+ } stream[4];
+ uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+ uint32_t pad[31];
+};
+
+struct gfx10_sh_query {
+ struct si_query b;
+
+ struct gfx10_sh_query_buffer *first;
+ struct gfx10_sh_query_buffer *last;
+ unsigned first_begin;
+ unsigned last_end;
+
+ unsigned stream;
+};
+
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
unsigned index);