From c26851b80b9d8daea4a98da05910e530cd6d80a7 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 23 Apr 2025 16:41:53 +0100 Subject: [PATCH] aco: increase max_const_offset_plus_one for SMEM load_global fossil-db (gfx1201): Totals from 1115 (1.40% of 79377) affected shaders: Instrs: 1473805 -> 1467571 (-0.42%); split: -0.43%, +0.01% CodeSize: 7852972 -> 7819656 (-0.42%); split: -0.44%, +0.02% SpillSGPRs: 1632 -> 1460 (-10.54%); split: -11.27%, +0.74% Latency: 11975762 -> 11971915 (-0.03%); split: -0.05%, +0.02% InvThroughput: 2496961 -> 2496448 (-0.02%); split: -0.03%, +0.01% VClause: 25213 -> 25218 (+0.02%); split: -0.00%, +0.02% SClause: 28822 -> 28565 (-0.89%); split: -1.41%, +0.52% Copies: 106377 -> 105715 (-0.62%); split: -1.23%, +0.61% Branches: 27497 -> 27473 (-0.09%) PreSGPRs: 52071 -> 51310 (-1.46%) VALU: 871051 -> 870694 (-0.04%); split: -0.04%, +0.00% SALU: 186090 -> 181811 (-2.30%); split: -2.32%, +0.02% Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 4 +++- src/amd/compiler/aco_ir.cpp | 7 +++++++ src/amd/compiler/aco_ir.h | 3 +++ src/amd/compiler/aco_optimizer.cpp | 9 +++------ 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index bcd3895b3cc..bf09786e9ce 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -6673,7 +6673,9 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) info.resource = bld.as_uniform(info.resource); info.offset = Operand(bld.as_uniform(info.offset)); info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM); - emit_load(ctx, bld, info, smem_load_params); + EmitLoadParameters params = smem_load_params; + params.max_const_offset_plus_one = ctx->program->dev.smem_offset_max + 1; + emit_load(ctx, bld, info, params); } else { EmitLoadParameters params = global_load_params; info.cache = get_cache_flags(ctx, access); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index d0775439838..cda3599ff46 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -180,6 +180,13 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info, program->dev.scratch_global_offset_max = 4095; } + if (program->gfx_level >= GFX8) + program->dev.smem_offset_max = 0xfffff; + else if (program->gfx_level >= GFX7) + program->dev.smem_offset_max = 0xffffffff; + else if (program->gfx_level >= GFX6) + program->dev.smem_offset_max = 0x3ff; + if (program->gfx_level >= GFX12) { /* Same as GFX11, except one less for VSAMPLE. */ program->dev.max_nsa_vgprs = 3; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index e8cea9c4ffa..b2579db1480 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2105,6 +2105,9 @@ struct DeviceInfo { int16_t scratch_global_offset_min; int16_t scratch_global_offset_max; unsigned max_nsa_vgprs; + + /* Note that GFX6/7 ignore the low 2 bits and this is only for positive offsets. */ + uint32_t smem_offset_max; }; enum class CompilationProgress { diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 486b8e11bd1..e5b7b5040e7 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -830,14 +830,11 @@ smem_combine(opt_ctx& ctx, aco_ptr& instr) Temp base; uint32_t offset; - if (info.is_constant_or_literal(32) && - ((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) || - (ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) || - (ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) { + if (info.is_constant_or_literal(32) && info.val <= ctx.program->dev.smem_offset_max) { instr->operands[1] = Operand::c32(info.val); } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, true) && - base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 && - offset % 4u == 0) { + base.regClass() == s1 && offset <= ctx.program->dev.smem_offset_max && + ctx.program->gfx_level >= GFX9 && offset % 4u == 0) { bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4); if (soe) { if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&