diff options
author | Rhys Perry <pendingchaos02@gmail.com> | 2024-06-06 15:32:38 +0100 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2024-07-01 17:34:22 +0000 |
commit | f842bd81ca94637c689e89b0513ba093350bf588 (patch) | |
tree | ced8f1e646b562b0a3ba6266fde007a393d90f17 /src | |
parent | ca161a96d1d9a06bdfa4e32e9903ca1a3a34332a (diff) |
aco: use s_pack_*_b32_b16 more in p_insert/p_extract lowering
This opcode doesn't write SCC, which gives later passes more freedom to
move instructions.
fossil-db (navi21):
Totals from 727 (0.92% of 79395) affected shaders:
Latency: 14943483 -> 14942704 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 3225790 -> 3225766 (-0.00%); split: -0.00%, +0.00%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29912>
Diffstat (limited to 'src')
-rw-r--r-- | src/amd/compiler/aco_lower_to_hw_instr.cpp | 17 | ||||
-rw-r--r-- | src/amd/compiler/tests/test_to_hw_instr.cpp | 10 |
2 files changed, 19 insertions, 8 deletions
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 582028a56b5..ad98f52cfcb 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2497,14 +2497,17 @@ lower_to_hw_instr(Program* program) bool signext = !instr->operands[3].constantEquals(0); if (dst.regClass() == s1) { - if (offset == (32 - bits)) { - bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst, - bld.def(s1, scc), op, Operand::c32(offset)); - } else if (offset == 0 && signext && (bits == 8 || bits == 16)) { + if (offset == 0 && signext && (bits == 8 || bits == 16)) { bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, dst, op); } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) { bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero()); + } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 && + !signext) { + bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero()); + } else if (offset == (32 - bits)) { + bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst, + bld.def(s1, scc), op, Operand::c32(offset)); } else { bld.sop2(signext ? 
aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand::c32((bits << 16) | offset)); @@ -2574,7 +2577,11 @@ lower_to_hw_instr(Program* program) bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11; if (dst.regClass() == s1) { - if (offset == (32 - bits)) { + if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) { + bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero()); + } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) { + bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op); + } else if (offset == (32 - bits)) { bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op, Operand::c32(offset)); } else if (offset == 0) { diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index b2dc7a24be6..063c7a9251a 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -425,7 +425,9 @@ BEGIN_TEST(to_hw_instr.extract) //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0 //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1] EXT(0, 16) - //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16 + //~gfx(7|8)_unsigned! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16 + //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_hh_b32_b16 %_:s[1], 0 + //~gfx.*_signed! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16 EXT(1, 16) #undef EXT @@ -523,9 +525,11 @@ BEGIN_TEST(to_hw_instr.insert) INS(2, 8) //! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 24 INS(3, 8) - //! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000 + //~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000 + //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0 INS(0, 16) - //! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16 + //~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16 + //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 0, %_:s[1] INS(1, 16) #undef INS |