From 81df5175537a20133e4460c2c4d047a4fe09d8f8 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 25 Aug 2025 16:20:32 +0100 Subject: [PATCH] aco: avoid unaligned offsets when selecting load_global_amd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMEM instructions mask off the low bits for the base and offset sources both before and after they're added. However, NIR expects ACO to only care about the alignment of the final address. fossil-db (gfx1201): Totals from 21 (0.03% of 79839) affected shaders: Instrs: 229780 -> 229876 (+0.04%) CodeSize: 1267724 -> 1268080 (+0.03%) Latency: 2800924 -> 2800978 (+0.00%) InvThroughput: 520250 -> 520256 (+0.00%) Copies: 27878 -> 27876 (-0.01%); split: -0.01%, +0.00% SALU: 29591 -> 29643 (+0.18%) fossil-db (polaris10): Totals from 3 (0.00% of 62201) affected shaders: Latency: 2651 -> 2652 (+0.04%) InvThroughput: 662 -> 663 (+0.15%) PreSGPRs: 51 -> 54 (+5.88%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../aco_instruction_selection.h | 1 + .../instruction_selection/aco_isel_setup.cpp | 2 + .../aco_select_nir_intrinsics.cpp | 37 ++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index 9405a01b871..29e9b631585 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -138,6 +138,7 @@ struct isel_context { /* NIR range analysis. */ struct hash_table* range_ht; + struct hash_table* numlsb_ht; Temp arg_temps[AC_MAX_ARGS]; Operand workgroup_id[3]; diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 7746b05f418..d57105cec31 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -375,6 +375,7 @@ init_context(isel_context* ctx, nir_shader* shader) /* Init NIR range analysis. */ ctx->range_ht = _mesa_pointer_hash_table_create(NULL); + ctx->numlsb_ht = _mesa_pointer_hash_table_create(NULL); uint32_t options = shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; @@ -728,6 +729,7 @@ init_context(isel_context* ctx, nir_shader* shader) void cleanup_context(isel_context* ctx) { + _mesa_hash_table_destroy(ctx->numlsb_ht, NULL); _mesa_hash_table_destroy(ctx->range_ht, NULL); } diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 1d3c6f9b9ec..01bc8ff9774 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -9,6 +9,8 @@ #include "aco_instruction_selection.h" #include "aco_ir.h" +#include "nir_range_analysis.h" + #include "ac_descriptors.h" #include "ac_nir.h" #include "amdgfxregs.h" @@ -281,6 +283,7 @@ struct LoadEmitInfo { unsigned align_mul = 0; unsigned align_offset = 0; pipe_format format; + nir_src* resource_src = NULL; /* should be equal to resource or NULL */ nir_src* offset_src = NULL; /* should be equal to offset or NULL */ isel_context* ctx; @@ -350,6 +353,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, if (new_info.resource.id() && new_info.resource.size() == 2 && add_might_overflow(ctx, info.offset_src, to_add)) { new_info.resource = add64_32(bld, new_info.resource, Operand::c32(to_add)); + new_info.resource_src = NULL; offset_changed = false; } else if (offset.isConstant()) { offset = Operand::c32(offset.constantValue() + to_add); @@ -502,6 +506,12 @@ get_smem_opcode(amd_gfx_level level, unsigned bytes, bool buffer, bool round_dow return {buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16, 64}; } +unsigned +src_has_req_lsb(isel_context* ctx, nir_src* src, unsigned req) +{ + return src && nir_def_num_lsb_zero(ctx->numlsb_ht, nir_get_scalar(src->ssa, 0)) >= req; +} + Temp smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed, unsigned align) { @@ -540,6 +550,30 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed */ RegClass rc(RegType::sgpr, DIV_ROUND_UP(util_next_power_of_two(bytes_needed), 4u)); + unsigned req_lsb_zero = bytes_needed == 1 ? 0 : (bytes_needed == 2 ? 1 : 2); + if (!buffer) { + /* We require each offset source and the final address to be aligned, so ensure at least + * two sources are aligned. The remaining one can then be assumed to be aligned, otherwise the + * final address is unaligned. */ + // TODO: lower in NIR + bool addr_aligned = src_has_req_lsb(info.ctx, info.resource_src, req_lsb_zero); + bool offset_aligned = + !offset.id() || src_has_req_lsb(info.ctx, info.offset_src, req_lsb_zero); + bool const_aligned = !const_offset || ffs(const_offset) > req_lsb_zero; + + if (!offset_aligned && (!addr_aligned || !const_aligned)) { + addr = add64_32(bld, addr, Operand(offset)); + offset = Temp(); + } + if (!const_aligned && (!addr_aligned || !offset_aligned)) { + addr = add64_32(bld, addr, Operand::c32(const_offset)); + const_offset = 0; + } + } else { + /* We assume the buffer resource is also aligned. */ + assert(!const_offset || ffs(const_offset) > req_lsb_zero); + } + bool soe = !buffer && offset.id() && const_offset && bld.program->gfx_level >= GFX9; aco_ptr load{create_instruction(op, Format::SMEM, 2 + soe, 1)}; if (buffer) { @@ -2334,7 +2368,8 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); info.sync = get_memory_sync_info(instr, storage_buffer, 0); - info.offset_src = &instr->src[1]; + info.resource_src = &instr->src[0]; + info.offset_src = offset.id() ? &instr->src[1] : NULL; info.cache = get_cache_flags(ctx, access, ac_access_type_load); info.disable_wqm = access & ACCESS_SKIP_HELPERS;