From 6eac72088c79d5ad91efadb59b09ad80f3a40add Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Wed, 13 Nov 2024 11:43:25 +0100
Subject: [PATCH] aco/gfx10+: only work around split execution of uniform LDS
 in WGP mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LDS instructions from one CU won't split the execution of other LDS
instructions on the same CU.

Reviewed-by: Timur Kristóf
Part-of:
---
 .../aco_select_nir_intrinsics.cpp             | 35 ++++++++++---------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
index fed87532be9..8417fd74714 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@@ -2931,6 +2931,17 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
                exec_scope);
 }
 
+/* The two 32-wide halves of a gfx10+ wave64 LDS instruction might be executed interleaved
+ * with LDS instructions from the other CU in WGP mode.
+ */
+bool
+lds_potential_non_atomic_split(isel_context* ctx, unsigned access)
+{
+   return ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
+          ctx->program->workgroup_size > 64 && ctx->program->wgp_mode &&
+          ((access & ACCESS_ATOMIC) || !ctx->shader->info.assume_no_data_races);
+}
+
 void
 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
 {
@@ -2974,14 +2985,11 @@ visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
       ds->operands.pop_back();
 
    if (def.getTemp() != dst) {
-      /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform
-       * addresses, if another wave writes LDS in between. Use v_readfirstlane instead of
-       * p_as_uniform in order to avoid copy-propagation.
+      /* Use v_readfirstlane instead of p_as_uniform in order to avoid copy-propagation of
+       * a potentially divergent value.
        */
-      bool atomic = nir_intrinsic_access(instr) & ACCESS_ATOMIC;
       bool readfirstlane_for_uniform =
-         ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
-         ctx->program->workgroup_size > 64 && (atomic || !ctx->shader->info.assume_no_data_races);
+         lds_potential_non_atomic_split(ctx, nir_intrinsic_access(instr));
 
       emit_vector_as_uniform(ctx, def.getTemp(), dst, readfirstlane_for_uniform);
    }
@@ -3188,12 +3196,10 @@ visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
    ds = bld.ds(op, Definition(tmp), m, address);
    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
 
-   /* In wave64 for hw with native wave32, ds_append seems to be split in a load for the low half
-    * and an atomic for the high half, and other LDS instructions can be scheduled between the two.
-    * Which means the result of the low half is unusable because it might be out of date.
+   /* If there is a write to the same LDS address between the split halves, only the second half
+    * will read the correct result.
     */
-   if (ctx->program->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
-       ctx->program->workgroup_size > 64) {
+   if (lds_potential_non_atomic_split(ctx, ACCESS_ATOMIC)) {
       Temp last_lane = bld.sop1(aco_opcode::s_flbit_i32_b64, bld.def(s1), Operand(exec, s2));
       last_lane = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                            Operand::c32(63), last_lane);
@@ -3243,11 +3249,8 @@ visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
    Temp dst = get_ssa_temp(ctx, &instr->def);
    if (dst.type() == RegType::sgpr) {
       /* Similar to load_shared. */
-      bool atomic = nir_intrinsic_access(instr) & ACCESS_ATOMIC;
-      bool readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
-                                       ctx->program->wave_size == 64 &&
-                                       ctx->program->workgroup_size > 64 &&
-                                       (atomic || !ctx->shader->info.assume_no_data_races);
+      bool readfirstlane_for_uniform =
+         lds_potential_non_atomic_split(ctx, nir_intrinsic_access(instr));
 
       emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
       Temp comp[4];
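
For readers without the ACO tree handy, below is a minimal standalone sketch of the
predicate this patch introduces. It is illustrative only: ProgramState and GfxLevel
are hypothetical stand-ins for ACO's isel_context/Program state, and ACCESS_ATOMIC
stands in for the NIR access flag. Only the wgp_mode test is new in this patch; the
rest of the condition previously existed, duplicated at each of the three call sites.

// Standalone model of lds_potential_non_atomic_split() (illustration only;
// the types below are simplified stand-ins, not the Mesa/ACO definitions).
#include <cstdio>

enum GfxLevel { GFX9 = 9, GFX10 = 10, GFX11 = 11 };
constexpr unsigned ACCESS_ATOMIC = 1u << 0; /* stand-in for the NIR flag */

struct ProgramState {
   GfxLevel gfx_level;
   unsigned wave_size;
   unsigned workgroup_size;
   bool wgp_mode;              /* the condition this patch adds */
   bool assume_no_data_races;
};

/* A gfx10+ wave64 LDS access is only at risk of being split by foreign LDS
 * traffic when the workgroup spans more than one wave and the two CUs of a
 * WGP share the LDS, i.e. in WGP mode. */
bool lds_potential_non_atomic_split(const ProgramState& s, unsigned access)
{
   return s.gfx_level >= GFX10 && s.wave_size == 64 && s.workgroup_size > 64 &&
          s.wgp_mode && ((access & ACCESS_ATOMIC) || !s.assume_no_data_races);
}

int main()
{
   ProgramState s = {GFX10, 64, 128, /*wgp_mode=*/true, /*assume_no_data_races=*/false};
   std::printf("WGP mode: workaround %s\n",
               lds_potential_non_atomic_split(s, 0) ? "needed" : "skipped");
   s.wgp_mode = false; /* CU mode: other-CU LDS instructions can't interleave */
   std::printf("CU mode:  workaround %s\n",
               lds_potential_non_atomic_split(s, 0) ? "needed" : "skipped");
}

Run as-is, this prints "needed" for WGP mode and "skipped" for CU mode, which matches
the commit message: LDS instructions from one CU won't split the execution of other
LDS instructions on the same CU, so in CU mode both wave32 halves of a wave64 LDS
access observe the same LDS contents and the v_readfirstlane/ds_append workarounds
can be skipped.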