From b2172467d1aa1997a18a711fdf0bbc9d95b5aaac Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Mon, 3 Nov 2025 13:19:07 +0100 Subject: [PATCH] aco/gfx10_3: work around NSA hazard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4+ dword NSA can hang if exec becomes non-zero again directly before the instruction. Foz-DB Navi21: Totals from 608 (0.74% of 82161) affected shaders: Instrs: 945138 -> 946431 (+0.14%) CodeSize: 5171580 -> 5176864 (+0.10%) Latency: 13356895 -> 13357113 (+0.00%) InvThroughput: 3043234 -> 3043236 (+0.00%); split: -0.00%, +0.00% Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9852 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13981 Cc: mesa-stable Reviewed-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/README-ISA.md | 11 +++++++++++ src/amd/compiler/aco_insert_NOPs.cpp | 27 ++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index e790fb290f6..f828e74d5df 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -338,6 +338,17 @@ Only `s_waitcnt_vscnt null, 0`. Needed even if the first instruction is a load. NSA MIMG instructions should be limited to 3 dwords before GFX10.3 to avoid stability issues: https://reviews.llvm.org/D103348 +## RDNA2 / GFX10.3 hazards + +### SALU EXEC write followed by NSA MIMG instruction + +Triggered-by: +Potential stability issues can occur if an SALU instruction changes exec from 0 +to non-zero immediately before an NSA MIMG instruction with 4+ dwords. + +Mitigated-by: Any instruction, including `s_nop`. + + ## RDNA3 / GFX11 hazards ### VcmpxPermlaneHazard diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 1872dc92634..9f8ac442da2 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -129,6 +129,7 @@ struct NOP_ctx_gfx10 { bool has_branch_after_DS = false; bool has_NSA_MIMG = false; bool has_writelane = false; + bool has_salu_exec_write = false; std::bitset<128> sgprs_read_by_VMEM; std::bitset<128> sgprs_read_by_VMEM_store; std::bitset<128> sgprs_read_by_DS; @@ -145,6 +146,7 @@ struct NOP_ctx_gfx10 { has_branch_after_DS |= other.has_branch_after_DS; has_NSA_MIMG |= other.has_NSA_MIMG; has_writelane |= other.has_writelane; + has_salu_exec_write |= other.has_salu_exec_write; sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM; sgprs_read_by_DS |= other.sgprs_read_by_DS; sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store; @@ -159,6 +161,7 @@ struct NOP_ctx_gfx10 { has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && + has_salu_exec_write == other.has_salu_exec_write && sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && sgprs_read_by_DS == other.sgprs_read_by_DS && sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store && @@ -907,6 +910,15 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& ctx.waits_since_fp_atomic = std::min(ctx.waits_since_fp_atomic, 3); } + /* 4+ dword NSA can hang if exec becomes non-zero again directly before the instruction. */ + if (instr->isSALU() && instr->writes_exec()) { + ctx.has_salu_exec_write = true; + } else if (ctx.has_salu_exec_write) { + ctx.has_salu_exec_write = false; + if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) + bld.sopp(aco_opcode::s_nop, 0); + } + if (state.program->gfx_level != GFX10) return; /* no other hazards/bugs to mitigate */ @@ -2019,13 +2031,15 @@ required_export_priority(Program* program) void insert_NOPs(Program* program) { + bool has_previous_part = + program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog || + (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS && + program->stage.sw != SWStage::TES) || + program->stage == raytracing_cs; + if (program->gfx_level >= GFX11) { NOP_ctx_gfx11 initial_ctx; - bool has_previous_part = - program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog || - (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS && - program->stage.sw != SWStage::TES) || program->stage == raytracing_cs; if (program->gfx_level >= GFX12 && has_previous_part) { /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any * SGPR might have been read by VALU if there was a previous shader part. @@ -2036,7 +2050,10 @@ insert_NOPs(Program* program) mitigate_hazards(program, initial_ctx); } else if (program->gfx_level >= GFX10) { - mitigate_hazards(program); + NOP_ctx_gfx10 initial_ctx; + initial_ctx.has_salu_exec_write = has_previous_part; + mitigate_hazards(program, + initial_ctx); } else { mitigate_hazards(program); }