From 0e21cd9e154bc82b978f295ae2c773de0c818d2d Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sat, 21 Sep 2024 15:02:23 +0200 Subject: [PATCH] aco/gfx10+: work around non uniform ds_append wave64 result In wave64 for hw with native wave32, ds_append seems to be split in a load for the low half and an atomic for the high half, and other LDS instructions can be scheduled between the two. Which means the result of the low half is unusable because it might be out of date. I was only able to reproduce this issue in WGP mode, but be conservative and apply the workaround in CU mode too. Foz-DB Navi31: Totals from 13 (0.02% of 79395) affected shaders: Instrs: 7599 -> 7656 (+0.75%) CodeSize: 39708 -> 39972 (+0.66%) Latency: 83174 -> 83572 (+0.48%) InvThroughput: 8271 -> 8357 (+1.04%) Copies: 718 -> 717 (-0.14%) VALU: 3689 -> 3703 (+0.38%) SALU: 935 -> 965 (+3.21%) Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11921 Fixes: 45e935800a8 ("aco: implement nir_shared_append/consume_amd") Reviewed-by: Rhys Perry Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a043f8f3af4..03111d4ce35 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -7595,7 +7595,19 @@ visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr) ds = bld.ds(op, Definition(tmp), m, address); ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw); - bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp); + /* In wave64 for hw with native wave32, ds_append seems to be split in a load for the low half + * and an atomic for the high half, and other LDS instructions can be scheduled between the two. + * Which means the result of the low half is unusable because it might be out of date. + */ + if (ctx->program->gfx_level >= GFX10 && ctx->program->wave_size == 64 && + ctx->program->workgroup_size > 64) { + Temp last_lane = bld.sop1(aco_opcode::s_flbit_i32_b64, bld.def(s1), Operand(exec, s2)); + last_lane = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(63), + last_lane); + bld.readlane(Definition(get_ssa_temp(ctx, &instr->def)), tmp, last_lane); + } else { + bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp); + } } void