radeonsi: add Wave32 heuristics and shader profiles

This generally works well. There are new cases that select Wave32, and there are shader profiles which adjust that. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13966>
author: Marek Olšák <marek.olsak@amd.com> 2021-11-19 18:36:03 -0500
committer: Marge Bot <emma+marge@anholt.net> 2021-12-11 20:07:35 +0000
commit: b3b2f97f2e25b2c4c72e5f04c945ce9c48ce6cd6 (patch)
tree: 6368067ccc57d6939a71f87071c99eec9d34f091
parent: e2a18833372c1d060635dd9d73956fc22b7b674e (diff)
5 files changed, 123 insertions, 5 deletions
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index bd7d3f27c74..1717686fe1d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -65,6 +65,7 @@ static const struct debug_named_value radeonsi_debug_options[] = {
    {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
    {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
    {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
+   {"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."},
    {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
    {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
    {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f01abc3d62a..b0db57599f0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -197,6 +197,7 @@ enum
    DBG_GISEL,
    DBG_W32_GE,
    DBG_W32_PS,
+   DBG_W32_PS_DISCARD,
    DBG_W32_CS,
    DBG_W64_GE,
    DBG_W64_PS,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d410b7a547c..b740e2ba6e6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -288,6 +288,10 @@ enum
 #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)
 
+#define SI_PROFILE_WAVE32                    (1 << 0)
+#define SI_PROFILE_WAVE64                    (1 << 1)
+#define SI_PROFILE_IGNORE_LLVM_DISCARD_BUG   (1 << 2)
+
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
  *
@@ -344,6 +348,7 @@ struct si_shader_info {
    shader_info base;
 
    gl_shader_stage stage;
+   uint32_t options; /* bitmask of SI_PROFILE_* */
 
    ubyte num_inputs;
    ubyte num_outputs;
@@ -404,6 +409,7 @@ struct si_shader_info {
    bool uses_bindless_samplers;
    bool uses_bindless_images;
    bool uses_indirect_descriptor;
+   bool has_divergent_loop;
 
    bool uses_vmem_return_type_sampler_or_bvh;
    bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 77fa0770efb..b43b5359da8 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -31,6 +31,29 @@
 #include "si_pipe.h"
 #include "si_shader_internal.h"
 #include "tgsi/tgsi_from_mesa.h"
+#include "util/mesa-sha1.h"
+
+
+struct si_shader_profile {
+   uint32_t sha1[SHA1_DIGEST_LENGTH32];
+   uint32_t options;
+};
+
+static struct si_shader_profile profiles[] =
+{
+   {
+      /* Viewperf/Energy isn't affected by the discard bug. */
+      {0x17118671, 0xd0102e0c, 0x947f3592, 0xb2057e7b, 0x4da5d9b0},
+      SI_PROFILE_IGNORE_LLVM_DISCARD_BUG,
+   },
+   {
+      /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
+       * probably due to interpolation performance.
+       */
+      {0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8},
+      SI_PROFILE_WAVE64,
+   },
+};
 
 static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type)
 {
@@ -397,6 +420,14 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
    info->base = nir->info;
    info->stage = nir->info.stage;
 
+   /* Get options from shader profiles. */
+   for (unsigned i = 0; i < ARRAY_SIZE(profiles); i++) {
+      if (_mesa_printed_sha1_equal(info->base.source_sha1, profiles[i].sha1)) {
+         info->options = profiles[i].options;
+         break;
+      }
+   }
+
    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
       if (info->base.tess.primitive_mode == GL_ISOLINES)
          info->base.tess.primitive_mode = GL_LINES;
@@ -531,6 +562,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
    /* Trim output read masks based on write masks. */
    for (unsigned i = 0; i < info->num_outputs; i++)
       info->output_readmask[i] &= info->output_usagemask[i];
+
+   info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir);
 }
 
 static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
@@ -932,5 +965,8 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
    if (sscreen->options.inline_uniforms)
       nir_find_inlinable_uniforms(nir);
 
+   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
+   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */
+
    return NULL;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index eaf3cf28bdb..6bd67a82c03 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -52,13 +52,87 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
        (stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg))
       return 64;
 
-   if (stage == MESA_SHADER_COMPUTE)
-      return sscreen->debug_flags & DBG(W32_CS) ? 32 : 64;
+   /* Small workgroups use Wave32 unconditionally. */
+   if (stage == MESA_SHADER_COMPUTE && info &&
+       !info->base.workgroup_size_variable &&
+       info->base.workgroup_size[0] *
+       info->base.workgroup_size[1] *
+       info->base.workgroup_size[2] <= 32)
+      return 32;
+
+   /* Debug flags. */
+   unsigned dbg_wave_size = 0;
+   if (sscreen->debug_flags &
+       (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
+        stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE)))
+      dbg_wave_size = 32;
+
+   if (sscreen->debug_flags &
+       (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
+        stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) {
+      assert(!dbg_wave_size);
+      dbg_wave_size = 64;
+   }
+
+   /* Shader profiles. */
+   unsigned profile_wave_size = 0;
+   if (info && info->options & SI_PROFILE_WAVE32)
+      profile_wave_size = 32;
+
+   if (info && info->options & SI_PROFILE_WAVE64) {
+      assert(!profile_wave_size);
+      profile_wave_size = 64;
+   }
+
+   if (profile_wave_size) {
+      /* Only debug flags override shader profiles. */
+      if (dbg_wave_size)
+         return dbg_wave_size;
+
+      return profile_wave_size;
+   }
+
+   /* LLVM 13 and 14 have a bug that causes compile failures with discard in Wave32
+    * in some cases. Alpha test in Wave32 is luckily unaffected.
+    */
+   if (stage == MESA_SHADER_FRAGMENT && info->base.fs.uses_discard &&
+       !(info && info->options & SI_PROFILE_IGNORE_LLVM_DISCARD_BUG) &&
+       LLVM_VERSION_MAJOR >= 13 && !(sscreen->debug_flags & DBG(W32_PS_DISCARD)))
+      return 64;
 
-   if (stage == MESA_SHADER_FRAGMENT)
-      return sscreen->debug_flags & DBG(W32_PS) ? 32 : 64;
+   /* Debug flags except w32psdiscard don't override the discard bug workaround,
+    * but they override everything else.
+    */
+   if (dbg_wave_size)
+      return dbg_wave_size;
+
+   /* Pixel shaders without interp instructions don't suffer from reduced interpolation
+    * performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
+    */
+   if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
+      return 32;
+
+   /* There are a few very rare cases where VS is better with Wave32, and there are no known
+    * cases where Wave64 is better.
+    */
+   if (stage <= MESA_SHADER_GEOMETRY)
+      return 32;
+
+   /* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
+    * individual shaders of merged shaders to match the wave size between them.
+    */
+   bool merged_shader = shader && !shader->is_gs_copy_shader &&
+                        (shader->key.ge.as_ls || shader->key.ge.as_es ||
+                         stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
+
+   /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
+    * while the other half is idling but occupying VGPRs, preventing other waves from launching.
+    * Wave32 eliminates the idling half to allow the next wave to start.
+    */
+   if (!merged_shader && info && info->has_divergent_loop)
+      return 32;
 
-   return sscreen->debug_flags & DBG(W32_GE) ? 32 : 64;
+   return 64;
 }
 
 /* SHADER_CACHE */
author	Marek Olšák <marek.olsak@amd.com>	2021-11-19 18:36:03 -0500
committer	Marge Bot <emma+marge@anholt.net>	2021-12-11 20:07:35 +0000
commit	b3b2f97f2e25b2c4c72e5f04c945ce9c48ce6cd6 (patch)
tree	6368067ccc57d6939a71f87071c99eec9d34f091
parent	e2a18833372c1d060635dd9d73956fc22b7b674e (diff)