summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Rhys Perry <pendingchaos02@gmail.com> 2024-06-06 15:32:38 +0100
committer Marge Bot <emma+marge@anholt.net> 2024-07-01 17:34:22 +0000
commit f842bd81ca94637c689e89b0513ba093350bf588 (patch)
tree ced8f1e646b562b0a3ba6266fde007a393d90f17 /src
parent ca161a96d1d9a06bdfa4e32e9903ca1a3a34332a (diff)
aco: use s_pack_*_b32_b16 more in p_insert/p_extract lowering
This opcode doesn't write SCC, which gives later passes more freedom to move instructions. fossil-db (navi21): Totals from 727 (0.92% of 79395) affected shaders: Latency: 14943483 -> 14942704 (-0.01%); split: -0.01%, +0.00% InvThroughput: 3225790 -> 3225766 (-0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29912>
Diffstat (limited to 'src')
-rw-r--r-- src/amd/compiler/aco_lower_to_hw_instr.cpp 17
-rw-r--r-- src/amd/compiler/tests/test_to_hw_instr.cpp 10
2 files changed, 19 insertions, 8 deletions
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 582028a56b5..ad98f52cfcb 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2497,14 +2497,17 @@ lower_to_hw_instr(Program* program)
bool signext = !instr->operands[3].constantEquals(0);
if (dst.regClass() == s1) {
- if (offset == (32 - bits)) {
- bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
- bld.def(s1, scc), op, Operand::c32(offset));
- } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
+ if (offset == 0 && signext && (bits == 8 || bits == 16)) {
bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
dst, op);
} else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
+ } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
+ !signext) {
+ bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
+ } else if (offset == (32 - bits)) {
+ bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
+ bld.def(s1, scc), op, Operand::c32(offset));
} else {
bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
@@ -2574,7 +2577,11 @@ lower_to_hw_instr(Program* program)
bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
if (dst.regClass() == s1) {
- if (offset == (32 - bits)) {
+ if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
+ bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
+ } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
+ bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
+ } else if (offset == (32 - bits)) {
bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
Operand::c32(offset));
} else if (offset == 0) {
diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp
index b2dc7a24be6..063c7a9251a 100644
--- a/src/amd/compiler/tests/test_to_hw_instr.cpp
+++ b/src/amd/compiler/tests/test_to_hw_instr.cpp
@@ -425,7 +425,9 @@ BEGIN_TEST(to_hw_instr.extract)
//~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
//~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1]
EXT(0, 16)
- //! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
+ //~gfx(7|8)_unsigned! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
+ //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_hh_b32_b16 %_:s[1], 0
+ //~gfx.*_signed! s1: %_:s[0], s1: %_:scc = @s_shr %_:s[1], 16
EXT(1, 16)
#undef EXT
@@ -523,9 +525,11 @@ BEGIN_TEST(to_hw_instr.insert)
INS(2, 8)
//! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 24
INS(3, 8)
- //! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
+ //~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
+ //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
INS(0, 16)
- //! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16
+ //~gfx(7|8)! s1: %_:s[0], s1: %_:scc = s_lshl_b32 %_:s[1], 16
+ //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 0, %_:s[1]
INS(1, 16)
#undef INS