diff options
author | Marek Olšák <marek.olsak@amd.com> | 2021-06-01 01:40:37 -0400 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-06-28 13:23:14 +0000 |
commit | 24292cc003aeba8d624d694dd78a920a1f819b3d (patch) | |
tree | 4e490c2fc6bee2d0da839fdaeb0549d396e452d8 | |
parent | b141e50282752cd1ad6de274fb0e66a3f7e6d011 (diff) |
radeonsi: move the accepting code into the bbox cull branch in prim discard CS
This reduces the number of jumps. No change in behavior.
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11510>
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute_prim_discard.c | 76 |
1 files changed, 49 insertions, 27 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 3fd9560dacd..971147d2cce 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -224,6 +224,9 @@ static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValu ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); } +static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, + void *data); + void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) { struct si_shader_key *key = &ctx->shader->key; @@ -430,23 +433,49 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) options.cull_zero_area = true; options.cull_w = true; - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), &options, - NULL, NULL); + LLVMValueRef params[] = { + instance_id, + vertex_counter, + output_indexbuf, + (void*)index, + ac_get_arg(&ctx->ac, param_start_out_index), + }; + + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, + ac_get_arg(&ctx->ac, param_smallprim_precision), &options, + si_build_primitive_accepted, params); + LLVMBuildRetVoid(builder); +} + +static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, + void *userdata) +{ + struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac); + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + unsigned vertices_per_prim = 3; + LLVMValueRef *params = (LLVMValueRef *)userdata; + LLVMValueRef instance_id = params[0]; + LLVMValueRef vertex_counter = params[1]; + LLVMValueRef output_indexbuf = params[2]; + LLVMValueRef *index = (LLVMValueRef *)params[3]; + LLVMValueRef start_out_index = 
params[4]; - ac_build_optimization_barrier(&ctx->ac, &accepted, false); LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + ac_build_ifcc(&ctx->ac, accepted, 16607); + /* Count the number of active threads by doing bitcount(accepted). */ LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask); num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); LLVMValueRef start; /* Execute atomic_add on the vertex count. */ struct si_thread0_section section; - si_enter_thread0_section(ctx, &section, thread_id, num_prims_accepted); + si_enter_thread0_section(ctx, &section, prim_index, num_prims_accepted); { LLVMValueRef num_indices = LLVMBuildMul( builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); @@ -462,33 +491,26 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) /* Now we need to store the indices of accepted primitives into * the output index buffer. */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } - /* Write indices for accepted primitives. 
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - vindex = LLVMBuildAdd(builder, vindex, ac_get_arg(&ctx->ac, param_start_out_index), ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); + /* Write indices for accepted primitives. */ + LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); + vindex = LLVMBuildAdd(builder, vindex, start_out_index, ""); + LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); + if (!ac_has_vec3_support(ctx->ac.chip_class, true)) + vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } + ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, + ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); } /* Return false if the shader isn't ready. */ |