radeonsi/gfx10: added support for gfx10 conditional rendering

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7526>
author: Indrajit Kumar Das <indrajit-kumar.das@amd.com> 2020-11-10 15:29:45 +0530
committer: Marge Bot <eric+marge@anholt.net> 2020-12-05 16:11:28 +0000
commit: 6df572532dce8fa5c09c4774e0f79e039b118a85 (patch)
tree: 1b6ba669a18df543c379a7c05c7b6fc9729fd5ac
parent: 3bd9db5be3c0e28443098dd0108e01a73c2b83e2 (diff)
3 files changed, 151 insertions, 104 deletions
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c
index f9aa027d16f..f31d4ed03a7 100644
--- a/src/gallium/drivers/radeonsi/gfx10_query.c
+++ b/src/gallium/drivers/radeonsi/gfx10_query.c
@@ -30,53 +30,6 @@
 
 #include <stddef.h>
 
-/**
- * The query buffer is written to by ESGS NGG shaders with statistics about
- * generated and (streamout-)emitted primitives.
- *
- * The context maintains a ring of these query buffers, and queries simply
- * point into the ring, allowing an arbitrary number of queries to be active
- * without additional GPU cost.
- */
-struct gfx10_sh_query_buffer {
-   struct list_head list;
-   struct si_resource *buf;
-   unsigned refcount;
-
-   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
-   unsigned head;
-};
-
-/* Memory layout of the query buffer. Must be kept in sync with shaders
- * (including QBO shaders) and should be aligned to cachelines.
- *
- * The somewhat awkward memory layout is for compatibility with the
- * SET_PREDICATION packet, which also means that we're setting the high bit
- * of all those values unconditionally.
- */
-struct gfx10_sh_query_buffer_mem {
-   struct {
-      uint64_t generated_primitives_start_dummy;
-      uint64_t emitted_primitives_start_dummy;
-      uint64_t generated_primitives;
-      uint64_t emitted_primitives;
-   } stream[4];
-   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
-   uint32_t pad[31];
-};
-
-/* Shader-based queries. */
-struct gfx10_sh_query {
-   struct si_query b;
-
-   struct gfx10_sh_query_buffer *first;
-   struct gfx10_sh_query_buffer *last;
-   unsigned first_begin;
-   unsigned last_end;
-
-   unsigned stream;
-};
-
 static void emit_shader_query(struct si_context *sctx)
 {
    assert(!list_is_empty(&sctx->shader_query_buffers));
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 17d9228e493..d3450310b18 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -1008,85 +1008,132 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf,
 
 static void si_emit_query_predication(struct si_context *ctx)
 {
-   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
-   struct si_query_buffer *qbuf;
    uint32_t op;
    bool flag_wait, invert;
 
+   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
    if (!query)
       return;
 
-   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
-                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
-      assert(!"not implemented");
-   }
-
    invert = ctx->render_cond_invert;
    flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
                ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
 
-   if (query->workaround_buf) {
-      op = PRED_OP(PREDICATION_OP_BOOL64);
-   } else {
-      switch (query->b.type) {
-      case PIPE_QUERY_OCCLUSION_COUNTER:
-      case PIPE_QUERY_OCCLUSION_PREDICATE:
-      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-         op = PRED_OP(PREDICATION_OP_ZPASS);
-         break;
-      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-         op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
-         invert = !invert;
-         break;
-      default:
-         assert(0);
-         return;
-      }
-   }
+   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+      struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
+      struct gfx10_sh_query_buffer *qbuf, *first, *last;
 
-   /* if true then invert, see GL_ARB_conditional_render_inverted */
-   if (invert)
-      op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
-   else
-      op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+      op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
 
-   /* Use the value written by compute shader as a workaround. Note that
-    * the wait flag does not apply in this predication mode.
-    *
-    * The shader outputs the result value to L2. Workarounds only affect GFX8
-    * and later, where the CP reads data from L2, so we don't need an
-    * additional flush.
-    */
-   if (query->workaround_buf) {
-      uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
-      emit_set_predicate(ctx, query->workaround_buf, va, op);
-      return;
-   }
+      /* sense is flipped for SO overflow; see GL_ARB_conditional_render_inverted */
+      if (!invert)
+         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+      else
+         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
 
-   op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
 
-   /* emit predicate packets for all data blocks */
-   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-      unsigned results_base = 0;
-      uint64_t va_base = qbuf->buf->gpu_address;
+      first = gfx10_query->first;
+      last = gfx10_query->last;
 
-      while (results_base < qbuf->results_end) {
+      while (first) {
+         qbuf = first;
+         if (first != last)
+            first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+         else
+            first = NULL;
+
+         unsigned results_base = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
+         uint64_t va_base = qbuf->buf->gpu_address;
          uint64_t va = va_base + results_base;
 
-         if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
-            for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
-               emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+         /* only the first buffer starts at first_begin and only the last one stops at last_end */
+         unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
+
+         unsigned count = (end - results_base) / sizeof(struct gfx10_sh_query_buffer_mem);
+         while (count--) {
+            if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+                  emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
 
-               /* set CONTINUE bit for all packets except the first */
+                  /* set CONTINUE bit for all packets except the first */
+                  op |= PREDICATION_CONTINUE;
+               }
+            } else {
+               emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
                op |= PREDICATION_CONTINUE;
             }
-         } else {
-            emit_set_predicate(ctx, qbuf->buf, va, op);
-            op |= PREDICATION_CONTINUE;
+
+            va += sizeof(struct gfx10_sh_query_buffer_mem);
+         }
+      }
+   } else {
+      struct si_query_buffer *qbuf;
+
+      if (query->workaround_buf) {
+         op = PRED_OP(PREDICATION_OP_BOOL64);
+      } else {
+         switch (query->b.type) {
+         case PIPE_QUERY_OCCLUSION_COUNTER:
+         case PIPE_QUERY_OCCLUSION_PREDICATE:
+         case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+            op = PRED_OP(PREDICATION_OP_ZPASS);
+            break;
+         case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+         case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+            op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+            invert = !invert;
+            break;
+         default:
+            assert(0);
+            return;
          }
+      }
 
-         results_base += query->result_size;
+      /* if true then invert, see GL_ARB_conditional_render_inverted */
+      if (invert)
+         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+      else
+         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+      /* Use the value written by compute shader as a workaround. Note that
+       * the wait flag does not apply in this predication mode.
+       *
+       * The shader outputs the result value to L2. Workarounds only affect GFX8
+       * and later, where the CP reads data from L2, so we don't need an
+       * additional flush.
+       */
+      if (query->workaround_buf) {
+         uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+         emit_set_predicate(ctx, query->workaround_buf, va, op);
+         return;
+      }
+
+      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+      /* emit predicate packets for all data blocks */
+      for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+         unsigned results_base = 0;
+         uint64_t va_base = qbuf->buf->gpu_address;
+
+         while (results_base < qbuf->results_end) {
+            uint64_t va = va_base + results_base;
+
+            if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+                  emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+                  /* set CONTINUE bit for all packets except the first */
+                  op |= PREDICATION_CONTINUE;
+               }
+            } else {
+               emit_set_predicate(ctx, qbuf->buf, va, op);
+               op |= PREDICATION_CONTINUE;
+            }
+
+            results_base += query->result_size;
+         }
       }
    }
 }
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 1eaa3b255a6..e0be318bcac 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -225,6 +225,53 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
 void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
 
 /* Shader-based queries */
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+   struct list_head list; /* links consecutive query buffers (walked via LIST_ENTRY) */
+   struct si_resource *buf; /* backing resource; entries are addressed from buf->gpu_address */
+   unsigned refcount;
+
+   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+   unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward memory layout is for compatibility with the
+ * SET_PREDICATION packet, which also means that we're setting the high bit
+ * of all those values unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+   struct {
+      uint64_t generated_primitives_start_dummy;
+      uint64_t emitted_primitives_start_dummy;
+      uint64_t generated_primitives;
+      uint64_t emitted_primitives;
+   } stream[4];
+   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+   uint32_t pad[31];
+};
+
+struct gfx10_sh_query {
+   struct si_query b;
+
+   struct gfx10_sh_query_buffer *first; /* first buffer holding entries of this query */
+   struct gfx10_sh_query_buffer *last; /* last buffer holding entries of this query */
+   unsigned first_begin; /* byte offset in first->buf where this query's entries start */
+   unsigned last_end; /* byte offset in last->buf where this query's entries end */
+
+   unsigned stream; /* selects the gfx10_sh_query_buffer_mem::stream[] slot to predicate on */
+};
+
 struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                          unsigned index);
author	Indrajit Kumar Das <indrajit-kumar.das@amd.com>	2020-11-10 15:29:45 +0530
committer	Marge Bot <eric+marge@anholt.net>	2020-12-05 16:11:28 +0000
commit	6df572532dce8fa5c09c4774e0f79e039b118a85 (patch)
tree	1b6ba669a18df543c379a7c05c7b6fc9729fd5ac
parent	3bd9db5be3c0e28443098dd0108e01a73c2b83e2 (diff)