aco: Add common utility to load scratch descriptor

Also modifies the scratch descriptor to take the stack pointer into
account.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35031>
This commit is contained in:
Natalie Vock 2025-02-17 18:42:48 +01:00 committed by Marge Bot
parent cd2caa5e2b
commit 4a62b342f3
4 changed files with 66 additions and 94 deletions

View file

@ -12,6 +12,9 @@
#include "c11/threads.h"
#include "ac_descriptors.h"
#include "amdgfxregs.h"
namespace aco {
thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
@ -1661,4 +1664,59 @@ create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
return inst;
}
/* Emit, through bld, the instructions that produce the 4-dword scratch buffer resource.
 *
 * resume_idx selects which entry of program->private_segment_buffers (and, when an offset is
 * applied, of program->scratch_offsets) is used. If apply_scratch_offset is set and the program
 * has scratch offsets, that offset is added to the 64-bit base address with carry propagation.
 */
Temp
load_scratch_resource(Program* program, Builder& bld, unsigned resume_idx,
                      bool apply_scratch_offset)
{
   Temp scratch_base;
   auto& segment_buffers = program->private_segment_buffers;
   if (!segment_buffers.empty())
      scratch_base = segment_buffers[resume_idx];

   if (scratch_base.bytes() != 0) {
      /* Non-compute hardware stages fetch the 64-bit base with a scalar load through the
       * segment buffer temp; for compute the temp is used as the base as-is.
       */
      if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
         scratch_base =
            bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_base, Operand::zero());
      }
   } else {
      /* No segment buffer temp: assemble the base from the two scratch address symbols. */
      Temp sym_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp sym_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sym_lo, sym_hi);
   }

   const bool add_offset = apply_scratch_offset && !program->scratch_offsets.empty();
   if (add_offset) {
      /* Split the base into halves, do a 64-bit add of the scratch offset, and recombine. */
      Temp base_lo = bld.tmp(s1);
      Temp base_hi = bld.tmp(s1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(base_lo), Definition(base_hi),
                 scratch_base);

      Temp carry_out = bld.tmp(s1);
      Temp offset = program->scratch_offsets[resume_idx];
      Temp sum_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry_out)),
                             base_lo, offset);
      Temp sum_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), base_hi,
                             Operand::c32(0), bld.scc(carry_out));
      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sum_lo, sum_hi);
   }

   /* Build the constant descriptor words; only desc[2] and desc[3] end up in the resource. */
   struct ac_buffer_state ac_state = {0};
   ac_state.size = 0xffffffff;
   ac_state.format = PIPE_FORMAT_R32_FLOAT;
   for (unsigned chan = 0; chan < 4; ++chan)
      ac_state.swizzle[chan] = PIPE_SWIZZLE_0;
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   ac_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u;
   ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
   ac_state.add_tid = true;
   ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;

   uint32_t desc[4];
   ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_base,
                     Operand::c32(desc[2]), Operand::c32(desc[3]));
}
} // namespace aco

View file

@ -2356,6 +2356,9 @@ RegisterDemand get_addr_regs_from_waves(Program* program, uint16_t waves);
bool uses_scratch(Program* program);
/* Emits, via bld, the instructions building the 4-dword scratch buffer resource and returns it.
 * resume_idx indexes program->private_segment_buffers (and program->scratch_offsets when the
 * offset is applied). With apply_scratch_offset set, the scratch offset is added to the 64-bit
 * base address, provided the program has any scratch offsets.
 */
Temp load_scratch_resource(Program* program, Builder& bld, unsigned resume_idx,
bool apply_scratch_offset);
inline bool
dominates_logical(const Block& parent, const Block& child)
{

View file

@ -1136,60 +1136,6 @@ spill_block(spill_ctx& ctx, unsigned block_idx)
}
}
/* Emit, through bld, the instructions that produce the 4-dword scratch buffer resource for the
 * spiller. The private segment buffer and scratch offset entries are selected by ctx.resume_idx.
 * With apply_scratch_offset set, the scratch offset is added to the 64-bit base address.
 */
Temp
load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
{
   Program* program = ctx.program;

   Temp scratch_base;
   if (!program->private_segment_buffers.empty())
      scratch_base = program->private_segment_buffers[ctx.resume_idx];

   if (scratch_base.bytes() != 0) {
      /* Non-compute hardware stages fetch the 64-bit base with a scalar load through the
       * segment buffer temp; for compute the temp is used as the base as-is.
       */
      if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
         scratch_base =
            bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_base, Operand::zero());
      }
   } else {
      /* No segment buffer temp: assemble the base from the two scratch address symbols. */
      Temp sym_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp sym_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sym_lo, sym_hi);
   }

   if (apply_scratch_offset) {
      /* Split the base into halves, do a 64-bit add of the scratch offset, and recombine. */
      Temp base_lo = bld.tmp(s1);
      Temp base_hi = bld.tmp(s1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(base_lo), Definition(base_hi),
                 scratch_base);

      Temp carry_out = bld.tmp(s1);
      Temp sum_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry_out)),
                             base_lo, program->scratch_offsets[ctx.resume_idx]);
      Temp sum_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), base_hi,
                             Operand::c32(0), bld.scc(carry_out));
      scratch_base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sum_lo, sum_hi);
   }

   /* Build the constant descriptor words; only desc[2] and desc[3] end up in the resource. */
   struct ac_buffer_state ac_state = {0};
   ac_state.size = 0xffffffff;
   ac_state.format = PIPE_FORMAT_R32_FLOAT;
   for (unsigned chan = 0; chan < 4; ++chan)
      ac_state.swizzle[chan] = PIPE_SWIZZLE_0;
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   ac_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u;
   ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
   ac_state.add_tid = true;
   ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;

   uint32_t desc[4];
   ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_base,
                     Operand::c32(desc[2]), Operand::c32(desc[3]));
}
void
setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot,
@ -1254,7 +1200,7 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
}
} else {
if (ctx.scratch_rsrc == Temp())
ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow);
ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, ctx.resume_idx, overflow);
if (overflow) {
uint32_t soffset =

View file

@ -3235,43 +3235,6 @@ visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
}
}
/* Emit, into ctx->block, the instructions that produce the 4-dword scratch buffer resource.
 * The last entry of the program's private segment buffers (if any) provides the base address.
 */
Temp
get_scratch_resource(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);

   Temp base;
   if (!ctx->program->private_segment_buffers.empty())
      base = ctx->program->private_segment_buffers.back();

   if (base.bytes() != 0) {
      /* Non-compute hardware stages fetch the 64-bit base with a scalar load through the
       * segment buffer temp; for compute the temp is used as the base as-is.
       */
      if (ctx->stage.hw != AC_HW_COMPUTE_SHADER)
         base = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), base, Operand::zero());
   } else {
      /* No segment buffer temp: assemble the base from the two scratch address symbols. */
      Temp sym_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp sym_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), sym_lo, sym_hi);
   }

   /* Build the constant descriptor words; only desc[2] and desc[3] end up in the resource. */
   struct ac_buffer_state ac_state = {0};
   ac_state.size = 0xffffffff;
   ac_state.format = PIPE_FORMAT_R32_FLOAT;
   for (unsigned chan = 0; chan < 4; ++chan)
      ac_state.swizzle[chan] = PIPE_SWIZZLE_0;
   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
   ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
   ac_state.add_tid = true;
   ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;

   uint32_t desc[4];
   ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), base, Operand::c32(desc[2]),
                     Operand::c32(desc[3]));
}
void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
@ -3297,7 +3260,8 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
params.max_const_offset = ctx->program->dev.scratch_global_offset_max;
emit_load(ctx, bld, info, params);
} else {
info.resource = get_scratch_resource(ctx);
info.resource = load_scratch_resource(
ctx->program, bld, ctx->program->private_segment_buffers.size() - 1, false);
info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
info.soffset = ctx->program->scratch_offsets.back();
emit_load(ctx, bld, info, scratch_mubuf_load_params);
@ -3351,7 +3315,8 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
memory_sync_info(storage_scratch, semantic_private));
}
} else {
Temp rsrc = get_scratch_resource(ctx);
Temp rsrc = load_scratch_resource(ctx->program, bld,
ctx->program->private_segment_buffers.size() - 1, false);
offset = as_vgpr(ctx, offset);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());