From d7dcd81c77efffce2ef122254c9dfc9dbb3bf447 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 30 May 2025 17:52:59 +0100 Subject: [PATCH] aco/gfx6: allow both constant and gpr offset for global with sgpr address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (pitcairn): Totals from 81 (0.13% of 62069) affected shaders: MaxWaves: 332 -> 335 (+0.90%) Instrs: 150087 -> 149737 (-0.23%); split: -0.30%, +0.06% CodeSize: 754636 -> 752712 (-0.25%); split: -0.31%, +0.06% SGPRs: 6128 -> 6184 (+0.91%) VGPRs: 7220 -> 7208 (-0.17%); split: -0.28%, +0.11% SpillSGPRs: 288 -> 287 (-0.35%) Latency: 2199197 -> 2198338 (-0.04%); split: -0.20%, +0.17% InvThroughput: 1613474 -> 1614303 (+0.05%); split: -0.07%, +0.12% VClause: 2905 -> 2862 (-1.48%); split: -2.34%, +0.86% SClause: 2366 -> 2378 (+0.51%); split: -0.17%, +0.68% Copies: 17312 -> 17264 (-0.28%); split: -1.03%, +0.76% PreSGPRs: 5080 -> 5004 (-1.50%) PreVGPRs: 5656 -> 5640 (-0.28%) VALU: 114097 -> 113831 (-0.23%); split: -0.31%, +0.07% SALU: 16004 -> 15944 (-0.37%); split: -0.41%, +0.04% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../aco_select_nir_intrinsics.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index e40f81a6404..29db2851d6c 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -261,6 +261,7 @@ struct LoadEmitInfo { unsigned align_offset = 0; pipe_format format; nir_src* offset_src = NULL; /* should be equal to offset or NULL */ + isel_context* ctx; ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}}; bool split_by_component_stride = true; @@ -315,6 +316,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, /* reduce constant offset */ LoadEmitInfo new_info = info; + new_info.ctx = ctx; Operand offset = info.offset; unsigned reduced_const_offset = const_offset; if (const_offset > params.max_const_offset) { @@ -817,8 +819,8 @@ get_gfx6_global_rsrc(Builder& bld, Temp addr) } void -lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout, - uint32_t* const_offset_inout, Temp* offset_inout) +lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* address_inout, + uint32_t* const_offset_inout, Temp* offset_inout, nir_src* offset_src) { Temp address = *address_inout; uint64_t const_offset = *const_offset_inout + offset_in; @@ -857,9 +859,10 @@ lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout, if (bld.program->gfx_level == GFX6) { /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (SGPR address, VGPR offset) */ /* GFX6 (MUBUF-addr64): (VGPR address, SGPR offset) */ - /* Disallow SGPR address with both a const_offset and offset because of possible overflow. */ + /* Disallow SGPR address with both a const_offset and offset in case of possible overflow. */ if (offset.id() && - (address.type() == RegType::vgpr ? offset.type() != RegType::sgpr : const_offset > 0)) { + (address.type() == RegType::vgpr ? offset.type() != RegType::sgpr + : add_might_overflow(ctx, offset_src, const_offset))) { address = add64_32(bld, address, Operand(offset)); offset = Temp(); } @@ -898,7 +901,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_need offset = Temp(); } uint32_t const_offset = info.const_offset; - lower_global_address(bld, 0, &addr, &const_offset, &offset); + lower_global_address(info.ctx, bld, 0, &addr, &const_offset, &offset, info.offset_src); unsigned bytes_size = 0; bool use_mubuf = bld.program->gfx_level == GFX6; @@ -2518,7 +2521,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) Temp write_address = addr; uint32_t write_const_offset = const_offset; Temp write_offset = offset; - lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset); + lower_global_address(ctx, bld, offsets[i], &write_address, &write_const_offset, &write_offset, + &instr->src[2]); unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE; if (write_datas[i].bytes() < 4) @@ -2613,7 +2617,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) Temp addr, offset; uint32_t const_offset; parse_global(ctx, instr, &addr, &const_offset, &offset); - lower_global_address(bld, 0, &addr, &const_offset, &offset); + lower_global_address(ctx, bld, 0, &addr, &const_offset, &offset, &instr->src[2]); if (ctx->options->gfx_level >= GFX7) { bool global = ctx->options->gfx_level >= GFX9;