diff options
author | Indrajit Kumar Das <indrajit-kumar.das@amd.com> | 2020-11-10 15:29:45 +0530 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2020-12-05 16:11:28 +0000 |
commit | 6df572532dce8fa5c09c4774e0f79e039b118a85 (patch) | |
tree | 1b6ba669a18df543c379a7c05c7b6fc9729fd5ac | |
parent | 3bd9db5be3c0e28443098dd0108e01a73c2b83e2 (diff) |
radeonsi/gfx10: added support for gfx10 conditional rendering
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7526>
-rw-r--r-- | src/gallium/drivers/radeonsi/gfx10_query.c | 47 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_query.c | 161 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_query.h | 47 |
3 files changed, 151 insertions, 104 deletions
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c index f9aa027d16f..f31d4ed03a7 100644 --- a/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/src/gallium/drivers/radeonsi/gfx10_query.c @@ -30,53 +30,6 @@ #include <stddef.h> -/** - * The query buffer is written to by ESGS NGG shaders with statistics about - * generated and (streamout-)emitted primitives. - * - * The context maintains a ring of these query buffers, and queries simply - * point into the ring, allowing an arbitrary number of queries to be active - * without additional GPU cost. - */ -struct gfx10_sh_query_buffer { - struct list_head list; - struct si_resource *buf; - unsigned refcount; - - /* Offset into the buffer in bytes; points at the first un-emitted entry. */ - unsigned head; -}; - -/* Memory layout of the query buffer. Must be kept in sync with shaders - * (including QBO shaders) and should be aligned to cachelines. - * - * The somewhat awkward memory layout is for compatibility with the - * SET_PREDICATION packet, which also means that we're setting the high bit - * of all those values unconditionally. - */ -struct gfx10_sh_query_buffer_mem { - struct { - uint64_t generated_primitives_start_dummy; - uint64_t emitted_primitives_start_dummy; - uint64_t generated_primitives; - uint64_t emitted_primitives; - } stream[4]; - uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ - uint32_t pad[31]; -}; - -/* Shader-based queries. */ -struct gfx10_sh_query { - struct si_query b; - - struct gfx10_sh_query_buffer *first; - struct gfx10_sh_query_buffer *last; - unsigned first_begin; - unsigned last_end; - - unsigned stream; -}; - static void emit_shader_query(struct si_context *sctx) { assert(!list_is_empty(&sctx->shader_query_buffers)); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 17d9228e493..d3450310b18 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -1008,85 +1008,132 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, static void si_emit_query_predication(struct si_context *ctx) { - struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; - struct si_query_buffer *qbuf; uint32_t op; bool flag_wait, invert; + struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; if (!query) return; - if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { - assert(!"not implemented"); - } - invert = ctx->render_cond_invert; flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - if (query->workaround_buf) { - op = PRED_OP(PREDICATION_OP_BOOL64); - } else { - switch (query->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - op = PRED_OP(PREDICATION_OP_ZPASS); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - op = PRED_OP(PREDICATION_OP_PRIMCOUNT); - invert = !invert; - break; - default: - assert(0); - return; - } - } + if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { + struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query; + struct gfx10_sh_query_buffer *qbuf, *first, *last; - /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (invert) - op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ - else - op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); - /* Use the value written by compute shader as a workaround. Note that - * the wait flag does not apply in this predication mode. - * - * The shader outputs the result value to L2. Workarounds only affect GFX8 - * and later, where the CP reads data from L2, so we don't need an - * additional flush. - */ - if (query->workaround_buf) { - uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; - emit_set_predicate(ctx, query->workaround_buf, va, op); - return; - } + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (!invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ - op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va_base = qbuf->buf->gpu_address; + first = gfx10_query->first; + last = gfx10_query->last; - while (results_base < qbuf->results_end) { + while (first) { + qbuf = first; + if (first != last) + first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + else + first = NULL; + + unsigned results_base = gfx10_query->first_begin; + uint64_t va_base = qbuf->buf->gpu_address; uint64_t va = va_base + results_base; - if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); + unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0; + unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0; + + unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + do { + if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op); - /* set CONTINUE bit for all packets except the first */ + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } else { + emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op); op |= PREDICATION_CONTINUE; } - } else { - emit_set_predicate(ctx, qbuf->buf, va, op); - op |= PREDICATION_CONTINUE; + + results_base += sizeof(struct gfx10_sh_query_buffer_mem); + } while (count--); + } + } else { + struct si_query_buffer *qbuf; + + if (query->workaround_buf) { + op = PRED_OP(PREDICATION_OP_BOOL64); + } else { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + invert = !invert; + break; + default: + assert(0); + return; } + } - results_base += query->result_size; + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ + + /* Use the value written by compute shader as a workaround. Note that + * the wait flag does not apply in this predication mode. + * + * The shader outputs the result value to L2. Workarounds only affect GFX8 + * and later, where the CP reads data from L2, so we don't need an + * additional flush. + */ + if (query->workaround_buf) { + uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; + emit_set_predicate(ctx, query->workaround_buf, va, op); + return; + } + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va_base = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + uint64_t va = va_base + results_base; + + if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } else { + emit_set_predicate(ctx, qbuf->buf, va, op); + op |= PREDICATION_CONTINUE; + } + + results_base += query->result_size; + } } } } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 1eaa3b255a6..e0be318bcac 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -225,6 +225,53 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query); void si_query_hw_resume(struct si_context *sctx, struct si_query *query); /* Shader-based queries */ + +/** + * The query buffer is written to by ESGS NGG shaders with statistics about + * generated and (streamout-)emitted primitives. + * + * The context maintains a ring of these query buffers, and queries simply + * point into the ring, allowing an arbitrary number of queries to be active + * without additional GPU cost. + */ +struct gfx10_sh_query_buffer { + struct list_head list; + struct si_resource *buf; + unsigned refcount; + + /* Offset into the buffer in bytes; points at the first un-emitted entry. */ + unsigned head; +}; + +/* Memory layout of the query buffer. Must be kept in sync with shaders + * (including QBO shaders) and should be aligned to cachelines. + * + * The somewhat awkward memory layout is for compatibility with the + * SET_PREDICATION packet, which also means that we're setting the high bit + * of all those values unconditionally. + */ +struct gfx10_sh_query_buffer_mem { + struct { + uint64_t generated_primitives_start_dummy; + uint64_t emitted_primitives_start_dummy; + uint64_t generated_primitives; + uint64_t emitted_primitives; + } stream[4]; + uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ + uint32_t pad[31]; +}; + +struct gfx10_sh_query { + struct si_query b; + + struct gfx10_sh_query_buffer *first; + struct gfx10_sh_query_buffer *last; + unsigned first_begin; + unsigned last_end; + + unsigned stream; +}; + struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, unsigned index); |