aco: Make private_segment_buffer/scratch_offset per-resume

We need different Temps for each resume shader, because registers aren't
preserved across resume boundaries.

This was likely fine in practice because arg registers are the same for
each shader, but resulted in invalid IR and asserts.

Fixes crashes in Indiana Jones RT with assertions enabled on GFX8.

Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34114>
This commit is contained in:
Natalie Vock 2025-03-28 18:04:43 +01:00 committed by Marge Bot
parent 76096d04bb
commit 3d8db3cbbb
4 changed files with 35 additions and 16 deletions

View file

@ -7416,7 +7416,9 @@ Temp
get_scratch_resource(isel_context* ctx)
{
Builder bld(ctx->program, ctx->block);
Temp scratch_addr = ctx->program->private_segment_buffer;
Temp scratch_addr;
if (!ctx->program->private_segment_buffers.empty())
scratch_addr = ctx->program->private_segment_buffers.back();
if (!scratch_addr.bytes()) {
Temp addr_lo =
bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
@ -7474,7 +7476,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
} else {
info.resource = get_scratch_resource(ctx);
info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
info.soffset = ctx->program->scratch_offset;
info.soffset = ctx->program->scratch_offsets.back();
emit_load(ctx, bld, info, scratch_mubuf_load_params);
}
}
@ -7530,7 +7532,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
offset = as_vgpr(ctx, offset);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(),
write_datas[i], offsets[i], true);
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
@ -10910,9 +10912,9 @@ add_startpgm(struct isel_context* ctx)
* handling spilling.
*/
if (ctx->args->ring_offsets.used)
ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
ctx->program->private_segment_buffers.push_back(get_arg(ctx, ctx->args->ring_offsets));
ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
ctx->program->scratch_offsets.push_back(get_arg(ctx, ctx->args->scratch_offset));
} else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
/* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
*/

View file

@ -2137,8 +2137,9 @@ public:
std::vector<ac_shader_debug_info> debug_info;
std::vector<uint8_t> constant_data;
Temp private_segment_buffer;
Temp scratch_offset;
/* Private segment buffers and scratch offsets. One entry per start/resume block */
aco::small_vec<Temp, 2> private_segment_buffers;
aco::small_vec<Temp, 2> scratch_offsets;
uint16_t num_waves = 0;
uint16_t min_waves = 0;

View file

@ -69,10 +69,14 @@ reindex_program(idx_ctx& ctx, Program* program)
}
/* update program members */
program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()],
program->private_segment_buffer.regClass());
program->scratch_offset =
Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
for (auto& private_segment_buffer : program->private_segment_buffers) {
private_segment_buffer =
Temp(ctx.renames[private_segment_buffer.id()], private_segment_buffer.regClass());
}
for (auto& scratch_offset : program->scratch_offsets) {
scratch_offset =
Temp(ctx.renames[scratch_offset.id()], scratch_offset.regClass());
}
program->temp_rc = ctx.temp_rc;
}

View file

@ -89,13 +89,16 @@ struct spill_ctx {
unsigned vgpr_spill_slots;
Temp scratch_rsrc;
unsigned resume_idx;
spill_ctx(const RegisterDemand target_pressure_, Program* program_)
: target_pressure(target_pressure_), program(program_), memory(),
renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0)
remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0),
resume_idx(0)
{}
void add_affinity(uint32_t first, uint32_t second)
@ -1131,7 +1134,10 @@ spill_block(spill_ctx& ctx, unsigned block_idx)
Temp
load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
{
Temp private_segment_buffer = ctx.program->private_segment_buffer;
Temp private_segment_buffer;
if (!ctx.program->private_segment_buffers.empty())
private_segment_buffer = ctx.program->private_segment_buffers[ctx.resume_idx];
if (!private_segment_buffer.bytes()) {
Temp addr_lo =
bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
@ -1152,7 +1158,7 @@ load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
Temp carry = bld.tmp(s1);
addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
ctx.program->scratch_offset);
ctx.program->scratch_offsets[ctx.resume_idx]);
addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
Operand::c32(0), bld.scc(carry));
@ -1261,7 +1267,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
uint32_t spill_id = spill->operands[1].constantValue();
uint32_t spill_slot = slots[spill_id];
Temp scratch_offset = ctx.program->scratch_offset;
Temp scratch_offset;
if (!ctx.program->scratch_offsets.empty())
scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx];
unsigned offset;
setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);
@ -1307,7 +1315,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
uint32_t spill_id = reload->operands[0].constantValue();
uint32_t spill_slot = slots[spill_id];
Temp scratch_offset = ctx.program->scratch_offset;
Temp scratch_offset;
if (!ctx.program->scratch_offsets.empty())
scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx];
unsigned offset;
setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset);
@ -1531,6 +1541,8 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
* we cannot reuse the current scratch_rsrc temp because its definition is unreachable */
if (block.linear_preds.empty())
ctx.scratch_rsrc = Temp();
if (block.kind & block_kind_resume)
++ctx.resume_idx;
}
std::vector<aco_ptr<Instruction>>::iterator it;