aco: only add/subtract low bits of program addresses

fossil-db (Sienna Cichlid):
Totals from 4007 (2.47% of 162293) affected shaders:
Instrs: 3733239 -> 3728018 (-0.14%)
CodeSize: 20770340 -> 20749456 (-0.10%)
Latency: 46883958 -> 46872764 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 10550392 -> 10548698 (-0.02%); split: -0.02%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16460>
This commit is contained in:
Rhys Perry 2022-05-11 19:33:32 +01:00 committed by Marge Bot
parent 69d1f4186a
commit 4513cb8d41
3 changed files with 11 additions and 21 deletions

View file

@ -903,7 +903,7 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
default: unreachable("Unhandled long jump.");
}
instr.reset(bld.sopp(inv, -1, 7));
instr.reset(bld.sopp(inv, -1, 6));
emit_instruction(ctx, out, instr.get());
}
@ -916,10 +916,7 @@ emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
emit_instruction(ctx, out, instr.get());
branch->pass_flags = out.size();
instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi,
Operand::c32(backwards ? UINT32_MAX : 0u))
.instr);
emit_instruction(ctx, out, instr.get());
/* s_addc_u32 for high 32 bits not needed because the program is in a 32-bit VA range */
/* restore SCC and clear the LSB of the new PC */
instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);

View file

@ -2100,8 +2100,7 @@ lower_to_hw_instr(Program* program)
bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
Operand(reg, s1), instr->operands[0], Operand::c32(id));
bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),
Operand(reg.advance(4), s1), Operand::zero(), Operand(scc, s1));
/* s_addc_u32 not needed because the program is in a 32-bit VA range */
break;
}
case aco_opcode::p_extract: {

View file

@ -65,8 +65,7 @@ BEGIN_TEST(assembler.long_jump.unconditional_forwards)
//!BB0:
//! s_getpc_b64 s[0:1] ; be801f00
//! s_addc_u32 s0, s0, 0x20018 ; 8200ff00 00020018
//! s_addc_u32 s1, s1, 0 ; 82018001
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
//! s_bitset0_b32 s0, 0 ; be801b80
//! s_setpc_b64 s[0:1] ; be802000
@ -94,10 +93,9 @@ BEGIN_TEST(assembler.long_jump.conditional_forwards)
return;
//! BB0:
//! s_cbranch_scc1 BB1 ; bf850007
//! s_cbranch_scc1 BB1 ; bf850006
//! s_getpc_b64 s[0:1] ; be801f00
//! s_addc_u32 s0, s0, 0x20018 ; 8200ff00 00020018
//! s_addc_u32 s1, s1, 0 ; 82018001
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
//! s_bitset0_b32 s0, 0 ; be801b80
//! s_setpc_b64 s[0:1] ; be802000
@ -134,7 +132,6 @@ BEGIN_TEST(assembler.long_jump.unconditional_backwards)
//! s_getpc_b64 s[0:1] ; be801f00
//! s_addc_u32 s0, s0, 0xfffdfffc ; 8200ff00 fffdfffc
//! s_addc_u32 s1, s1, -1 ; 8201c101
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
//! s_bitset0_b32 s0, 0 ; be801b80
//! s_setpc_b64 s[0:1] ; be802000
@ -160,10 +157,9 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards)
for (unsigned i = 0; i < INT16_MAX + 1; i++)
bld.sopp(aco_opcode::s_nop, -1, 0);
//! s_cbranch_execz BB1 ; bf880007
//! s_cbranch_execz BB1 ; bf880006
//! s_getpc_b64 s[0:1] ; be801f00
//! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8
//! s_addc_u32 s1, s1, -1 ; 8201c101
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
//! s_bitset0_b32 s0, 0 ; be801b80
//! s_setpc_b64 s[0:1] ; be802000
@ -188,7 +184,7 @@ BEGIN_TEST(assembler.long_jump.3f)
//! s_nop 0 ; bf800000
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1);
for (unsigned i = 0; i < 0x3f - 7; i++) // a unconditional long jump is 7 dwords
for (unsigned i = 0; i < 0x3f - 6; i++) // an unconditional long jump is 6 dwords
bld.vop1(aco_opcode::v_nop);
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
@ -219,7 +215,7 @@ BEGIN_TEST(assembler.long_jump.constaddr)
bld.reset(program->create_and_insert_block());
//>> s_getpc_b64 s[0:1] ; be801f00
//! s_add_u32 s0, s0, 0xe0 ; 8000ff00 000000e0
//! s_add_u32 s0, s0, 0xe4 ; 8000ff00 000000e4
bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand::zero());
bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
Operand(PhysReg(0), s1), Operand::zero(), Operand::zero());
@ -302,13 +298,11 @@ BEGIN_TEST(assembler.p_constaddr)
dst1.setFixed(PhysReg(2));
//>> s_getpc_b64 s[0:1] ; be801c00
//! s_add_u32 s0, s0, 32 ; 8000ff00 00000020
//! s_addc_u32 s1, s1, 0 ; 82018001
//! s_add_u32 s0, s0, 24 ; 8000ff00 00000018
bld.pseudo(aco_opcode::p_constaddr, dst0, Operand::zero());
//! s_getpc_b64 s[2:3] ; be821c00
//! s_add_u32 s2, s2, 48 ; 8002ff02 00000030
//! s_addc_u32 s3, s3, 0 ; 82038003
//! s_add_u32 s2, s2, 44 ; 8002ff02 0000002c
bld.pseudo(aco_opcode::p_constaddr, dst1, Operand::c32(32));
aco::lower_to_hw_instr(program.get());