summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2021-11-19 18:36:03 -0500
committerMarge Bot <emma+marge@anholt.net>2021-12-11 20:07:35 +0000
commitb3b2f97f2e25b2c4c72e5f04c945ce9c48ce6cd6 (patch)
tree6368067ccc57d6939a71f87071c99eec9d34f091
parente2a18833372c1d060635dd9d73956fc22b7b674e (diff)
radeonsi: add Wave32 heuristics and shader profiles
This generally works well. There are new cases that select Wave32, and there are shader profiles which adjust that. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13966>
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.c1
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h1
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.h6
-rw-r--r--src/gallium/drivers/radeonsi/si_shader_nir.c36
-rw-r--r--src/gallium/drivers/radeonsi/si_state_shaders.cpp84
5 files changed, 123 insertions, 5 deletions
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index bd7d3f27c74..1717686fe1d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -65,6 +65,7 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
{"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
{"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
+ {"w32psdiscard", DBG(W32_PS_DISCARD), "Use Wave32 for pixel shaders even if they contain discard and LLVM is buggy."},
{"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
{"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
{"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f01abc3d62a..b0db57599f0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -197,6 +197,7 @@ enum
DBG_GISEL,
DBG_W32_GE,
DBG_W32_PS,
+ DBG_W32_PS_DISCARD,
DBG_W32_CS,
DBG_W64_GE,
DBG_W64_PS,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d410b7a547c..b740e2ba6e6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -288,6 +288,10 @@ enum
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 5) & 0xff)
+#define SI_PROFILE_WAVE32 (1 << 0)
+#define SI_PROFILE_WAVE64 (1 << 1)
+#define SI_PROFILE_IGNORE_LLVM_DISCARD_BUG (1 << 2)
+
/**
* For VS shader keys, describe any fixups required for vertex fetch.
*
@@ -344,6 +348,7 @@ struct si_shader_info {
shader_info base;
gl_shader_stage stage;
+ uint32_t options; /* bitmask of SI_PROFILE_* */
ubyte num_inputs;
ubyte num_outputs;
@@ -404,6 +409,7 @@ struct si_shader_info {
bool uses_bindless_samplers;
bool uses_bindless_images;
bool uses_indirect_descriptor;
+ bool has_divergent_loop;
bool uses_vmem_return_type_sampler_or_bvh;
bool uses_vmem_return_type_other; /* all other VMEM loads and atomics with return */
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 77fa0770efb..b43b5359da8 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -31,6 +31,29 @@
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "tgsi/tgsi_from_mesa.h"
+#include "util/mesa-sha1.h"
+
+
+struct si_shader_profile {
+ uint32_t sha1[SHA1_DIGEST_LENGTH32];
+ uint32_t options;
+};
+
+static struct si_shader_profile profiles[] =
+{
+ {
+ /* Viewperf/Energy isn't affected by the discard bug. */
+ {0x17118671, 0xd0102e0c, 0x947f3592, 0xb2057e7b, 0x4da5d9b0},
+ SI_PROFILE_IGNORE_LLVM_DISCARD_BUG,
+ },
+ {
+ /* Viewperf/Medical, a shader with a divergent loop doesn't benefit from Wave32,
+ * probably due to interpolation performance.
+ */
+ {0x29f0f4a0, 0x0672258d, 0x47ccdcfd, 0x31e67dcc, 0xdcb1fda8},
+ SI_PROFILE_WAVE64,
+ },
+};
static const nir_src *get_texture_src(nir_tex_instr *instr, nir_tex_src_type type)
{
@@ -397,6 +420,14 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
info->base = nir->info;
info->stage = nir->info.stage;
+ /* Get options from shader profiles. */
+ for (unsigned i = 0; i < ARRAY_SIZE(profiles); i++) {
+ if (_mesa_printed_sha1_equal(info->base.source_sha1, profiles[i].sha1)) {
+ info->options = profiles[i].options;
+ break;
+ }
+ }
+
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
if (info->base.tess.primitive_mode == GL_ISOLINES)
info->base.tess.primitive_mode = GL_LINES;
@@ -531,6 +562,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
/* Trim output read masks based on write masks. */
for (unsigned i = 0; i < info->num_outputs; i++)
info->output_readmask[i] &= info->output_usagemask[i];
+
+ info->has_divergent_loop = nir_has_divergent_loop((nir_shader*)nir);
}
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
@@ -932,5 +965,8 @@ char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
if (sscreen->options.inline_uniforms)
nir_find_inlinable_uniforms(nir);
+ NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
+ NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */
+
return NULL;
}
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index eaf3cf28bdb..6bd67a82c03 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -52,13 +52,87 @@ unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *sha
(stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg))
return 64;
- if (stage == MESA_SHADER_COMPUTE)
- return sscreen->debug_flags & DBG(W32_CS) ? 32 : 64;
+ /* Small workgroups use Wave32 unconditionally. */
+ if (stage == MESA_SHADER_COMPUTE && info &&
+ !info->base.workgroup_size_variable &&
+ info->base.workgroup_size[0] *
+ info->base.workgroup_size[1] *
+ info->base.workgroup_size[2] <= 32)
+ return 32;
+
+ /* Debug flags. */
+ unsigned dbg_wave_size = 0;
+ if (sscreen->debug_flags &
+ (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
+ stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) | DBG(W32_PS_DISCARD) : DBG(W32_GE)))
+ dbg_wave_size = 32;
+
+ if (sscreen->debug_flags &
+ (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
+ stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE))) {
+ assert(!dbg_wave_size);
+ dbg_wave_size = 64;
+ }
+
+ /* Shader profiles. */
+ unsigned profile_wave_size = 0;
+ if (info && info->options & SI_PROFILE_WAVE32)
+ profile_wave_size = 32;
+
+ if (info && info->options & SI_PROFILE_WAVE64) {
+ assert(!profile_wave_size);
+ profile_wave_size = 64;
+ }
+
+ if (profile_wave_size) {
+ /* Only debug flags override shader profiles. */
+ if (dbg_wave_size)
+ return dbg_wave_size;
+
+ return profile_wave_size;
+ }
+
+ /* LLVM 13 and 14 have a bug that causes compile failures with discard in Wave32
+ * in some cases. Alpha test in Wave32 is luckily unaffected.
+ */
+ if (stage == MESA_SHADER_FRAGMENT && info->base.fs.uses_discard &&
+ !(info && info->options & SI_PROFILE_IGNORE_LLVM_DISCARD_BUG) &&
+ LLVM_VERSION_MAJOR >= 13 && !(sscreen->debug_flags & DBG(W32_PS_DISCARD)))
+ return 64;
- if (stage == MESA_SHADER_FRAGMENT)
- return sscreen->debug_flags & DBG(W32_PS) ? 32 : 64;
+ /* Debug flags except w32psdiscard don't override the discard bug workaround,
+ * but they override everything else.
+ */
+ if (dbg_wave_size)
+ return dbg_wave_size;
+
+ /* Pixel shaders without interp instructions don't suffer from reduced interpolation
+ * performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
+ */
+ if (stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
+ return 32;
+
+ /* There are a few very rare cases where VS is better with Wave32, and there are no known
+ * cases where Wave64 is better.
+ */
+ if (stage <= MESA_SHADER_GEOMETRY)
+ return 32;
+
+ /* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
+ * individual shaders of merged shaders to match the wave size between them.
+ */
+ bool merged_shader = shader && !shader->is_gs_copy_shader &&
+ (shader->key.ge.as_ls || shader->key.ge.as_es ||
+ stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
+
+ /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
+ * while the other half is idling but occupying VGPRs, preventing other waves from launching.
+ * Wave32 eliminates the idling half to allow the next wave to start.
+ */
+ if (!merged_shader && info && info->has_divergent_loop)
+ return 32;
- return sscreen->debug_flags & DBG(W32_GE) ? 32 : 64;
+ return 64;
}
/* SHADER_CACHE */