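aco/gfx12: increase maximum vbuffer offset

On GFX12 the MUBUF/MTBUF constant byte offset can be as large as 0x7fffff
(23 bits) instead of 0xfff (12 bits). Add DeviceInfo::buf_offset_max, widen
the MUBUF/MTBUF instruction offset fields to 23 bits, and use the new limit
in place of the hard-coded 4095/4096 constants in lower_global_address,
resolve_excess_vmem_const_offset, visit_load_buffer, label_instruction and
setup_vgpr_spill_reload.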

fossil-db (gfx1201):
Totals from 301 (0.38% of 79377) affected shaders:
Instrs: 2734478 -> 2728816 (-0.21%); split: -0.21%, +0.00%
CodeSize: 14347476 -> 14306568 (-0.29%)
Latency: 15508055 -> 15502202 (-0.04%); split: -0.04%, +0.00%
InvThroughput: 2846419 -> 2842387 (-0.14%); split: -0.14%, +0.00%
VClause: 68286 -> 68101 (-0.27%); split: -0.30%, +0.03%
SClause: 49487 -> 49500 (+0.03%)
Copies: 207179 -> 206093 (-0.52%); split: -0.57%, +0.04%
Branches: 72941 -> 72942 (+0.00%); split: -0.00%, +0.00%
VALU: 1549156 -> 1544727 (-0.29%); split: -0.29%, +0.00%
SALU: 339620 -> 338989 (-0.19%); split: -0.19%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34730>
This commit is contained in:
Rhys Perry 2025-04-23 17:01:48 +01:00 committed by Marge Bot
parent d987d5e341
commit 6338ed44c5
5 changed files with 45 additions and 27 deletions

View file

@ -4783,7 +4783,7 @@ lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
if (bld.program->gfx_level >= GFX9)
max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
else if (bld.program->gfx_level == GFX6)
max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
max_const_offset_plus_one = bld.program->dev.buf_offset_max + 1;
uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
const_offset %= max_const_offset_plus_one;
@ -5297,9 +5297,10 @@ create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_t
inline unsigned
resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
{
if (const_offset >= 4096) {
unsigned excess_const_offset = const_offset / 4096u * 4096u;
const_offset %= 4096u;
uint32_t limit = bld.program->dev.buf_offset_max + 1;
if (const_offset >= limit) {
unsigned excess_const_offset = const_offset / limit * limit;
const_offset %= limit;
if (!voffset.id())
voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
@ -6990,14 +6991,18 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
info.split_by_component_stride = false;
emit_load(ctx, bld, info, mtbuf_load_params);
EmitLoadParameters params = mtbuf_load_params;
params.max_const_offset_plus_one = ctx->program->dev.buf_offset_max + 1;
emit_load(ctx, bld, info, params);
} else {
assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
assert(!swizzled);
emit_load(ctx, bld, info, mubuf_load_format_params);
EmitLoadParameters params = mubuf_load_format_params;
params.max_const_offset_plus_one = ctx->program->dev.buf_offset_max + 1;
emit_load(ctx, bld, info, params);
} else {
const unsigned swizzle_element_size =
swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
@ -7007,7 +7012,9 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
info.align_mul = align_mul;
info.align_offset = align_offset;
emit_load(ctx, bld, info, mubuf_load_params);
EmitLoadParameters params = mubuf_load_params;
params.max_const_offset_plus_one = ctx->program->dev.buf_offset_max + 1;
emit_load(ctx, bld, info, params);
}
}
}

View file

@ -183,6 +183,11 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
program->dev.scratch_global_offset_max = 4095;
}
if (program->gfx_level >= GFX12)
program->dev.buf_offset_max = 0x7fffff;
else
program->dev.buf_offset_max = 0xfff;
if (program->gfx_level >= GFX12)
program->dev.smem_offset_max = 0x7fffff;
else if (program->gfx_level >= GFX8)

View file

@ -1627,15 +1627,14 @@ static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected
struct MUBUF_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
bool tfe : 1; /* texture fail enable */
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
uint8_t padding1;
uint16_t offset; /* Unsigned byte offset - 12 bit */
uint32_t offset : 23; /* Unsigned byte offset */
uint32_t offen : 1; /* Supply an offset from VGPR (VADDR) */
uint32_t idxen : 1; /* Supply an index from VGPR (VADDR) */
uint32_t addr64 : 1; /* SI, CIK: Address size is 64-bit */
uint32_t tfe : 1; /* texture fail enable */
uint32_t lds : 1; /* Return read-data to LDS instead of VGPRs */
uint32_t disable_wqm : 1; /* Require an exec mask without helper invocations */
uint32_t padding : 3;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1656,10 +1655,12 @@ struct MTBUF_instruction : public Instruction {
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool tfe : 1; /* texture fail enable */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding : 5;
uint16_t offset; /* Unsigned byte offset - 12 bit */
uint8_t padding0 : 5;
uint16_t padding1;
uint32_t offset : 23; /* Unsigned byte offset */
uint32_t padding2 : 9;
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
/**
* Vector Memory Image Instructions
@ -2105,6 +2106,7 @@ struct DeviceInfo {
int32_t scratch_global_offset_max;
unsigned max_nsa_vgprs;
uint32_t buf_offset_max;
/* Note that GFX6/7 ignore the low 2 bits and this is only for positive offsets. */
uint32_t smem_offset_max;
};

View file

@ -1499,35 +1499,38 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
* MUBUF accesses. */
bool vaddr_prevent_overflow = swizzled && ctx.program->gfx_level < GFX9;
uint32_t const_max = ctx.program->dev.buf_offset_max;
if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
mubuf.offset + info.instr->operands[1].constantValue() < 4096) {
mubuf.offset + info.instr->operands[1].constantValue() <= const_max) {
instr->operands[1] = info.instr->operands[0];
mubuf.offset += info.instr->operands[1].constantValue();
mubuf.offen = false;
continue;
} else if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
mubuf.offset + info.val < 4096) {
mubuf.offset + info.val <= const_max) {
assert(!mubuf.idxen);
instr->operands[1] = Operand(v1);
mubuf.offset += info.val;
mubuf.offen = false;
continue;
} else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
} else if (i == 2 && info.is_constant_or_literal(32) &&
mubuf.offset + info.val <= const_max) {
instr->operands[2] = Operand::c32(0);
mubuf.offset += info.val;
continue;
} else if (mubuf.offen && i == 1 &&
parse_base_offset(ctx, instr.get(), i, &base, &offset,
vaddr_prevent_overflow) &&
base.regClass() == v1 && mubuf.offset + offset < 4096) {
base.regClass() == v1 && mubuf.offset + offset <= const_max) {
assert(!mubuf.idxen);
instr->operands[1].setTemp(base);
mubuf.offset += offset;
continue;
} else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
base.regClass() == s1 && mubuf.offset + offset < 4096 && !swizzled) {
base.regClass() == s1 && mubuf.offset + offset <= const_max && !swizzled) {
instr->operands[i].setTemp(base);
mubuf.offset += offset;
continue;
@ -1542,7 +1545,8 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (mtbuf.offen && mtbuf.idxen && i == 1 && info.is_vec() &&
info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
mtbuf.offset + info.instr->operands[1].constantValue() < 4096) {
mtbuf.offset + info.instr->operands[1].constantValue() <=
ctx.program->dev.buf_offset_max) {
instr->operands[1] = info.instr->operands[0];
mtbuf.offset += info.instr->operands[1].constantValue();
mtbuf.offen = false;

View file

@ -1202,8 +1202,8 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
offset_range =
ctx.program->dev.scratch_global_offset_max - ctx.program->dev.scratch_global_offset_min;
} else {
if (scratch_size < 4095)
offset_range = 4095 - scratch_size;
if (scratch_size < ctx.program->dev.buf_offset_max)
offset_range = ctx.program->dev.buf_offset_max - scratch_size;
else
offset_range = 0;
}