aco: sometimes join linear wait entries on logical edges

fossil-db (gfx1201):
Totals from 1303 (1.64% of 79653) affected shaders:
Instrs: 6920949 -> 6917692 (-0.05%); split: -0.06%, +0.01%
CodeSize: 37112404 -> 37095728 (-0.04%); split: -0.05%, +0.01%
Latency: 70471343 -> 70365986 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 11515673 -> 11504666 (-0.10%); split: -0.10%, +0.01%

fossil-db (navi31):
Totals from 1293 (1.62% of 79653) affected shaders:
Instrs: 6500186 -> 6496761 (-0.05%); split: -0.06%, +0.01%
CodeSize: 34562712 -> 34549236 (-0.04%); split: -0.04%, +0.01%
Latency: 68604746 -> 68666532 (+0.09%); split: -0.15%, +0.24%
InvThroughput: 11276591 -> 11284914 (+0.07%); split: -0.10%, +0.17%

fossil-db (navi21):
Totals from 811 (1.02% of 79653) affected shaders:
Instrs: 4110953 -> 4108788 (-0.05%); split: -0.05%, +0.00%
CodeSize: 22955984 -> 22948064 (-0.03%); split: -0.03%, +0.00%
Latency: 35070231 -> 35064448 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 6945610 -> 6945053 (-0.01%); split: -0.01%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
This commit is contained in:
Rhys Perry 2025-05-13 12:54:00 +01:00
parent c1f8537131
commit 1088ac49db
2 changed files with 60 additions and 4 deletions

View file

@ -199,7 +199,7 @@ struct wait_ctx {
: program(program_), gfx_level(program_->gfx_level), info(info_)
{}
bool join(const wait_ctx* other, bool logical)
bool join(const wait_ctx* other, bool logical, bool logical_merge)
{
bool changed = (other->pending_flat_lgkm && !pending_flat_lgkm) ||
(other->pending_flat_vm && !pending_flat_vm) || (~nonzero & other->nonzero);
@ -212,7 +212,7 @@ struct wait_ctx {
using iterator = std::map<PhysReg, wait_entry>::iterator;
for (const auto& entry : other->gpr_map) {
if (entry.second.logical != logical) {
if (logical_merge ? !logical : (entry.second.logical != logical)) {
if (logical) {
iterator it = gpr_map.find(entry.first);
if (it != gpr_map.end()) {
@ -917,11 +917,24 @@ insert_waitcnt(Program* program)
continue;
}
/* Sometimes the counter for an entry is incremented or removed on all logical predecessors,
* so it might be better to join entries using the logical predecessors instead of the linear
* ones.
*/
bool logical_merge =
current.logical_preds.size() > 1 &&
std::any_of(current.linear_preds.begin(), current.linear_preds.end(),
[&](unsigned pred)
{
return std::find(current.logical_preds.begin(), current.logical_preds.end(),
pred) == current.logical_preds.end();
});
bool changed = false;
for (unsigned b : current.linear_preds)
changed |= ctx.join(&out_ctx[b], false);
changed |= ctx.join(&out_ctx[b], false, logical_merge);
for (unsigned b : current.logical_preds)
changed |= ctx.join(&out_ctx[b], true);
changed |= ctx.join(&out_ctx[b], true, logical_merge);
if (done[current.index] && !changed) {
in_ctx[current.index] = std::move(ctx);

View file

@ -756,3 +756,46 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_different_lanes)
finish_waitcnt_test();
}
END_TEST
/* Wait-count insertion across a divergent if/else where both branches issue
 * one more buffer load after a load that was started before the branch.
 *
 * A buffer_load_dword into v[4] is emitted ahead of the if/else, each branch
 * then emits a buffer_load_dword into v[5], and v[4] is read after the merge.
 * The expected output below requires a wait with counter value 1
 * (s_waitcnt vmcnt(1) on GFX10.3/GFX11, s_wait_loadcnt imm:1 on GFX12) in
 * front of that read.
 */
BEGIN_TEST(insert_waitcnt.divergent_branch.inc_counter)
for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) {
/* Skip this gfx level when setup_cs() reports failure. */
if (!setup_cs(NULL, gfx))
continue;
/* PhysReg 260/261 are printed as v[4]/v[5] and PhysReg 256 as v[0] in the expected output. */
Definition def_v4(PhysReg(260), v1);
Definition def_v5(PhysReg(261), v1);
Operand op_v0(PhysReg(256), v1);
Operand desc_s4(PhysReg(0), s4);
/* NOTE(review): desc_s8 is not referenced anywhere in this test. */
Operand desc_s8(PhysReg(8), s8);
/* Load issued before the branch; its result (v[4]) is only consumed after the merge. */
//>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
emit_divergent_if_else(
program.get(), bld, Operand::c64(1),
[&]()
{
/* First branch callback: marker 1 followed by a load into v[5]. */
//>> p_unit_test 1
//! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc_s4, op_v0, Operand::zero(), 0,
false);
},
[&]()
{
/* Second branch callback: marker 2 followed by the same load into v[5]. */
//>> p_unit_test 2
//! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc_s4, op_v0, Operand::zero(), 0,
false);
});
/* After the merge: marker 3, then a read of v[4], which must be preceded by the wait. */
//>> p_unit_test 3
//~gfx(10_3|11)! s_waitcnt vmcnt(1)
//~gfx12! s_wait_loadcnt imm:1
//! p_unit_test %0:v[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1));
finish_waitcnt_test();
}
END_TEST