aco: avoid unaligned offsets when selecting load_global_amd

SMEM instructions mask off the low bits for the base and offset sources
both before and after they're added. However, NIR expects ACO to only
care about the alignment of the final address.

fossil-db (gfx1201):
Totals from 21 (0.03% of 79839) affected shaders:
Instrs: 229780 -> 229876 (+0.04%)
CodeSize: 1267724 -> 1268080 (+0.03%)
Latency: 2800924 -> 2800978 (+0.00%)
InvThroughput: 520250 -> 520256 (+0.00%)
Copies: 27878 -> 27876 (-0.01%); split: -0.01%, +0.00%
SALU: 29591 -> 29643 (+0.18%)

fossil-db (polaris10):
Totals from 3 (0.00% of 62201) affected shaders:
Latency: 2651 -> 2652 (+0.04%)
InvThroughput: 662 -> 663 (+0.15%)
PreSGPRs: 51 -> 54 (+5.88%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37301>
This commit is contained in:
Rhys Perry 2025-08-25 16:20:32 +01:00 committed by Marge Bot
parent 6d71521ecd
commit 81df517553
3 changed files with 39 additions and 1 deletions

View file

@ -138,6 +138,7 @@ struct isel_context {
/* NIR range analysis. */
struct hash_table* range_ht;
struct hash_table* numlsb_ht;
Temp arg_temps[AC_MAX_ARGS];
Operand workgroup_id[3];

View file

@ -375,6 +375,7 @@ init_context(isel_context* ctx, nir_shader* shader)
/* Init NIR range analysis. */
ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
ctx->numlsb_ht = _mesa_pointer_hash_table_create(NULL);
uint32_t options =
shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
@ -728,6 +729,7 @@ init_context(isel_context* ctx, nir_shader* shader)
void
cleanup_context(isel_context* ctx)
{
   /* Tear down the hash tables that init_context() created for NIR range
    * analysis. NULL is passed as the second argument, so no per-entry
    * callback is supplied to the destroy call. The tables are released in the
    * order listed: numlsb_ht first, then range_ht.
    */
   struct hash_table* const analysis_tables[] = {ctx->numlsb_ht, ctx->range_ht};
   for (struct hash_table* table : analysis_tables)
      _mesa_hash_table_destroy(table, NULL);
}

View file

@ -9,6 +9,8 @@
#include "aco_instruction_selection.h"
#include "aco_ir.h"
#include "nir_range_analysis.h"
#include "ac_descriptors.h"
#include "ac_nir.h"
#include "amdgfxregs.h"
@ -281,6 +283,7 @@ struct LoadEmitInfo {
unsigned align_mul = 0;
unsigned align_offset = 0;
pipe_format format;
nir_src* resource_src = NULL; /* should be equal to resource or NULL */
nir_src* offset_src = NULL; /* should be equal to offset or NULL */
isel_context* ctx;
@ -350,6 +353,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
if (new_info.resource.id() && new_info.resource.size() == 2 &&
add_might_overflow(ctx, info.offset_src, to_add)) {
new_info.resource = add64_32(bld, new_info.resource, Operand::c32(to_add));
new_info.resource_src = NULL;
offset_changed = false;
} else if (offset.isConstant()) {
offset = Operand::c32(offset.constantValue() + to_add);
@ -502,6 +506,12 @@ get_smem_opcode(amd_gfx_level level, unsigned bytes, bool buffer, bool round_dow
return {buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16, 64};
}
/* Checks whether the value behind a NIR source has enough zero low bits.
 *
 * ctx: instruction selection context; its numlsb_ht table is handed to
 *      nir_def_num_lsb_zero().
 * src: NIR source to inspect, may be NULL.
 * req: minimum result of nir_def_num_lsb_zero() that counts as sufficient
 *      (presumably the number of least significant bits that must be zero).
 *
 * Returns 0 when src is NULL. Otherwise returns 1 if the value computed for
 * component 0 of src->ssa is at least req, and 0 if it is not.
 *
 * NOTE(review): the result is a truth value even though the declared return
 * type is unsigned.
 */
unsigned
src_has_req_lsb(isel_context* ctx, nir_src* src, unsigned req)
{
   /* A missing source gives us nothing to analyse, so report "not known". */
   if (!src)
      return false;

   const auto known_zero_lsb = nir_def_num_lsb_zero(ctx->numlsb_ht, nir_get_scalar(src->ssa, 0));
   return known_zero_lsb >= req;
}
Temp
smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed, unsigned align)
{
@ -540,6 +550,30 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed
*/
RegClass rc(RegType::sgpr, DIV_ROUND_UP(util_next_power_of_two(bytes_needed), 4u));
unsigned req_lsb_zero = bytes_needed == 1 ? 0 : (bytes_needed == 2 ? 1 : 2);
if (!buffer) {
/* We require each offset source and the final address to be aligned, so ensure at least
* two sources are aligned. The remaining one can then be assumed to be aligned, otherwise the
* final address is unaligned. */
// TODO: lower in NIR
bool addr_aligned = src_has_req_lsb(info.ctx, info.resource_src, req_lsb_zero);
bool offset_aligned =
!offset.id() || src_has_req_lsb(info.ctx, info.offset_src, req_lsb_zero);
bool const_aligned = !const_offset || ffs(const_offset) > req_lsb_zero;
if (!offset_aligned && (!addr_aligned || !const_aligned)) {
addr = add64_32(bld, addr, Operand(offset));
offset = Temp();
}
if (!const_aligned && (!addr_aligned || !offset_aligned)) {
addr = add64_32(bld, addr, Operand::c32(const_offset));
const_offset = 0;
}
} else {
/* We assume the buffer resource is also aligned. */
assert(!const_offset || ffs(const_offset) > req_lsb_zero);
}
bool soe = !buffer && offset.id() && const_offset && bld.program->gfx_level >= GFX9;
aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2 + soe, 1)};
if (buffer) {
@ -2334,7 +2368,8 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
info.sync = get_memory_sync_info(instr, storage_buffer, 0);
info.offset_src = &instr->src[1];
info.resource_src = &instr->src[0];
info.offset_src = offset.id() ? &instr->src[1] : NULL;
info.cache = get_cache_flags(ctx, access, ac_access_type_load);
info.disable_wqm = access & ACCESS_SKIP_HELPERS;