radeonsi/gfx9: always wrap GS and TCS in an if-block

With merged ESGS shaders, the GS part of a wave may be empty, and the hardware gets confused if any GS messages are sent from that wave. Since S_SENDMSG is executed even when EXEC = 0, we have to wrap even non-monolithic GS shaders in an if-block, so that the entire shader and hence the S_SENDMSG instructions are skipped in empty waves. This change is not required for TCS/HS, but applying it there as well simplifies the code a bit. Fixes GL45-CTS.geometry_shader.rendering.rendering.* Cc: mesa-stable@lists.freedesktop.org
author: Nicolai Hähnle <nicolai.haehnle@amd.com> 2017-07-14 14:33:37 +0200
committer: Nicolai Hähnle <nicolai.haehnle@amd.com> 2017-07-14 14:33:37 +0200
commit: 846131e50efb2cb69dbd8d4dd41e7eb6b3c7adf4 (patch)
tree: 7912113a82e59665fc55815a7affba5ae4ddf676
parent: c578649b1ee16c2a71ab7f2e810703f01f9a1b8d (diff)
2 files changed, 45 insertions, 32 deletions
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 90164ca1b6..5a613f6bda 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2820,6 +2820,9 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 
 	si_copy_tcs_inputs(bld_base);
 
+	if (ctx->screen->b.chip_class >= GFX9)
+		lp_build_endif(&ctx->merged_wrap_if_state);
+
 	rel_patch_id = get_rel_patch_id(ctx);
 	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
 	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
@@ -3053,6 +3056,9 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 
 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
 			 si_get_gs_wave_id(ctx));
+
+	if (ctx->screen->b.chip_class >= GFX9)
+		lp_build_endif(&ctx->merged_wrap_if_state);
 }
 
 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
@@ -5666,14 +5672,20 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 	preload_ring_buffers(ctx);
 
 	/* For GFX9 merged shaders:
-	 * - Set EXEC. If the prolog is present, set EXEC there instead.
+	 * - Set EXEC for the first shader. If the prolog is present, set
+	 *   EXEC there instead.
 	 * - Add a barrier before the second shader.
+	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
+	 *   an if-statement. This is required for correctness in geometry
+	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+	 *   GS_CUT messages.
 	 *
-	 * The same thing for monolithic shaders is done in
-	 * si_build_wrapper_function.
+	 * For monolithic merged shaders, the first shader is wrapped in an
+	 * if-block together with its prolog in si_build_wrapper_function.
 	 */
-	if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
-		if (sel->info.num_instructions > 1 && /* not empty shader */
+	if (ctx->screen->b.chip_class >= GFX9) {
+		if (!is_monolithic &&
+		    sel->info.num_instructions > 1 && /* not empty shader */
 		    (shader->key.as_es || shader->key.as_ls) &&
 		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
 		     (ctx->type == PIPE_SHADER_VERTEX &&
@@ -5682,9 +5694,19 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
 						ctx->param_merged_wave_info, 0);
 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 			   ctx->type == PIPE_SHADER_GEOMETRY) {
-			si_init_exec_from_input(ctx,
-						ctx->param_merged_wave_info, 8);
+			if (!is_monolithic)
+				si_init_exec_full_mask(ctx);
+
+			/* The barrier must execute for all shaders in a
+			 * threadgroup.
+			 */
 			si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+			LLVMValueRef ena =
+				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+					    ac_get_thread_id(&ctx->ac), num_threads, "");
+			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
 		}
 	}
 
@@ -6156,15 +6178,9 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
 		/* Merged shaders are executed conditionally depending
 		 * on the number of enabled threads passed in the input SGPRs. */
-		if (is_merged_shader(ctx->shader) &&
-		    (part == 0 || part == next_shader_first_part)) {
+		if (is_merged_shader(ctx->shader) && part == 0) {
 			LLVMValueRef ena, count = initial[3];
 
-			/* The thread count for the 2nd shader is at bit-offset 8. */
-			if (part == next_shader_first_part) {
-				count = LLVMBuildLShr(builder, count,
-						      LLVMConstInt(ctx->i32, 8, 0), "");
-			}
 			count = LLVMBuildAnd(builder, count,
 					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
 			ena = LLVMBuildICmp(builder, LLVMIntULT,
@@ -6221,26 +6237,20 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
 
 		if (is_merged_shader(ctx->shader) &&
-		    (part + 1 == next_shader_first_part ||
-		     part + 1 == num_parts)) {
+		    part + 1 == next_shader_first_part) {
 			lp_build_endif(&if_state);
 
-			if (part + 1 == next_shader_first_part) {
-				/* A barrier is required between 2 merged shaders. */
-				si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
-
-				/* The second half of the merged shader should use
-				 * the inputs from the toplevel (wrapper) function,
-				 * not the return value from the last call.
-				 *
-				 * That's because the last call was executed condi-
-				 * tionally, so we can't consume it in the main
-				 * block.
-				 */
-				memcpy(out, initial, sizeof(initial));
-				num_out = initial_num_out;
-				num_out_sgpr = initial_num_out_sgpr;
-			}
+			/* The second half of the merged shader should use
+			 * the inputs from the toplevel (wrapper) function,
+			 * not the return value from the last call.
+			 *
+			 * That's because the last call was executed condi-
+			 * tionally, so we can't consume it in the main
+			 * block.
+			 */
+			memcpy(out, initial, sizeof(initial));
+			num_out = initial_num_out;
+			num_out_sgpr = initial_num_out_sgpr;
 			continue;
 		}
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index d2eb957ffd..9d019fa543 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -25,6 +25,7 @@
 #define SI_SHADER_PRIVATE_H
 
 #include "si_shader.h"
+#include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "tgsi/tgsi_parse.h"
@@ -108,6 +109,8 @@ struct si_shader_context {
 	unsigned flow_depth;
 	unsigned flow_depth_max;
 
+	struct lp_build_if_state merged_wrap_if_state;
+
 	struct tgsi_array_info *temp_arrays;
 	LLVMValueRef *temp_array_allocas;
author	Nicolai Hähnle <nicolai.haehnle@amd.com>	2017-07-14 14:33:37 +0200
committer	Nicolai Hähnle <nicolai.haehnle@amd.com>	2017-07-14 14:33:37 +0200
commit	846131e50efb2cb69dbd8d4dd41e7eb6b3c7adf4 (patch)
tree	7912113a82e59665fc55815a7affba5ae4ddf676
parent	c578649b1ee16c2a71ab7f2e810703f01f9a1b8d (diff)