aco: Add Primitive Ordered Pixel Shading waitcnt rules

When letting the overlapping waves enter their ordered sections, no memory
accesses to resources which need primitive-ordered access may still be
pending; otherwise there would be a race between the current wave and the
overlapping waves.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Vitaliy Triang3l Kuzmin <triang3l@yandex.ru>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22250>
This commit is contained in:
Vitaliy Triang3l Kuzmin 2023-04-06 23:09:35 +03:00 committed by Marge Bot
parent a87628cd08
commit e0f4b52559
4 changed files with 37 additions and 0 deletions

View file

@ -509,6 +509,25 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
force_waitcnt(ctx, imm);
}
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
* overlapping waves proceed into the ordered section.
*/
if (ctx.program->has_pops_overlapped_waves_wait &&
(ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done
: (instr->opcode == aco_opcode::s_sendmsg &&
instr->sopp().imm == sendmsg_ordered_ps_done))) {
if (ctx.vm_cnt)
imm.vm = 0;
if (ctx.gfx_level >= GFX10 && ctx.vs_cnt)
imm.vs = 0;
/* Await SMEM loads too, as it's possible for an application to create them, like using a
* scalarization loop - pointless and suboptimal for an inherently divergent address of
* per-pixel data, but still can be done at least synthetically and must be handled correctly.
*/
if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_cnt)
imm.lgkm = 0;
}
check_instr(ctx, imm, delay, instr);
/* It's required to wait for scalar stores before "writing back" data.

View file

@ -4398,6 +4398,8 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
{
assert(align >= 4u);
bld.program->has_smem_buffer_or_global_loads = true;
bool buffer = info.resource.id() && info.resource.bytes() == 16;
Temp addr = info.resource;
if (!buffer && !addr.id()) {

View file

@ -2118,6 +2118,7 @@ public:
Stage stage;
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
bool needs_wqm = false; /* there exists a p_wqm instruction */
bool has_smem_buffer_or_global_loads = false;
bool has_pops_overlapped_waves_wait = false;
bool has_color_exports = false;

View file

@ -2462,6 +2462,21 @@ lower_to_hw_instr(Program* program)
block = &program->blocks[block_idx];
bld.reset(discard_block);
if (program->has_pops_overlapped_waves_wait &&
(program->gfx_level >= GFX11 || discard_sends_pops_done)) {
/* If this discard early exit potentially exits the POPS ordered section, do
* the waitcnt necessary before resuming overlapping waves as the normal
* waitcnt insertion doesn't work in a discard early exit block.
*/
if (program->gfx_level >= GFX10)
bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
wait_imm pops_exit_wait_imm;
pops_exit_wait_imm.vm = 0;
if (program->has_smem_buffer_or_global_loads)
pops_exit_wait_imm.lgkm = 0;
bld.sopp(aco_opcode::s_waitcnt, -1,
pops_exit_wait_imm.pack(program->gfx_level));
}
if (discard_sends_pops_done)
bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
unsigned target = V_008DFC_SQ_EXP_NULL;