From 3d8db3cbbb9635cd271c5ccbd4b4d54d14633f57 Mon Sep 17 00:00:00 2001 From: Natalie Vock Date: Fri, 28 Mar 2025 18:04:43 +0100 Subject: [PATCH] aco: Make private_segment_buffer/scratch_offset per-resume We need different Temps for each resume shader, because registers aren't preserved across resume boundaries. This was likely fine in practice because arg registers are the same for each shader, but resulted in invalid IR and asserts. Fixes crashes in Indiana Jones RT with assertions enabled on GFX8. Cc: mesa-stable Part-of: --- .../compiler/aco_instruction_selection.cpp | 12 +++++----- src/amd/compiler/aco_ir.h | 5 +++-- src/amd/compiler/aco_reindex_ssa.cpp | 12 ++++++---- src/amd/compiler/aco_spill.cpp | 22 ++++++++++++++----- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 873d7886fc8..7b33c446215 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -7416,7 +7416,9 @@ Temp get_scratch_resource(isel_context* ctx) { Builder bld(ctx->program, ctx->block); - Temp scratch_addr = ctx->program->private_segment_buffer; + Temp scratch_addr; + if (!ctx->program->private_segment_buffers.empty()) + scratch_addr = ctx->program->private_segment_buffers.back(); if (!scratch_addr.bytes()) { Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); @@ -7474,7 +7476,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) } else { info.resource = get_scratch_resource(ctx); info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))); - info.soffset = ctx->program->scratch_offset; + info.soffset = ctx->program->scratch_offsets.back(); emit_load(ctx, bld, info, scratch_mubuf_load_params); } } @@ -7530,7 +7532,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) offset = as_vgpr(ctx, offset); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, + Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(), write_datas[i], offsets[i], true); mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD | @@ -10910,9 +10912,9 @@ add_startpgm(struct isel_context* ctx) * handling spilling. */ if (ctx->args->ring_offsets.used) - ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); + ctx->program->private_segment_buffers.push_back(get_arg(ctx, ctx->args->ring_offsets)); - ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset); + ctx->program->scratch_offsets.push_back(get_arg(ctx, ctx->args->scratch_offset)); } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) { /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog. */ diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index e6eb1a5d910..a14fb760569 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2137,8 +2137,9 @@ public: std::vector debug_info; std::vector constant_data; - Temp private_segment_buffer; - Temp scratch_offset; + /* Private segment buffers and scratch offsets. One entry per start/resume block */ + aco::small_vec private_segment_buffers; + aco::small_vec scratch_offsets; uint16_t num_waves = 0; uint16_t min_waves = 0; diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp index 7c30e5b5365..f06da735039 100644 --- a/src/amd/compiler/aco_reindex_ssa.cpp +++ b/src/amd/compiler/aco_reindex_ssa.cpp @@ -69,10 +69,14 @@ reindex_program(idx_ctx& ctx, Program* program) } /* update program members */ - program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()], - program->private_segment_buffer.regClass()); - program->scratch_offset = - Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); + for (auto& private_segment_buffer : program->private_segment_buffers) { + private_segment_buffer = + Temp(ctx.renames[private_segment_buffer.id()], private_segment_buffer.regClass()); + } + for (auto& scratch_offset : program->scratch_offsets) { + scratch_offset = + Temp(ctx.renames[scratch_offset.id()], scratch_offset.regClass()); + } program->temp_rc = ctx.temp_rc; } diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 03479b2691a..8fd52790003 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -89,13 +89,16 @@ struct spill_ctx { unsigned vgpr_spill_slots; Temp scratch_rsrc; + unsigned resume_idx; + spill_ctx(const RegisterDemand target_pressure_, Program* program_) : target_pressure(target_pressure_), program(program_), memory(), renames(program->blocks.size(), aco::map(memory)), spills_entry(program->blocks.size(), aco::unordered_map(memory)), spills_exit(program->blocks.size(), aco::unordered_map(memory)), processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()), - remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0) + remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0), + resume_idx(0) {} void add_affinity(uint32_t first, uint32_t second) @@ -1131,7 +1134,10 @@ spill_block(spill_ctx& ctx, unsigned block_idx) Temp load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) { - Temp private_segment_buffer = ctx.program->private_segment_buffer; + Temp private_segment_buffer; + if (!ctx.program->private_segment_buffers.empty()) + private_segment_buffer = ctx.program->private_segment_buffers[ctx.resume_idx]; + if (!private_segment_buffer.bytes()) { Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); @@ -1152,7 +1158,7 @@ load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) Temp carry = bld.tmp(s1); addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, - ctx.program->scratch_offset); + ctx.program->scratch_offsets[ctx.resume_idx]); addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, Operand::c32(0), bld.scc(carry)); @@ -1261,7 +1267,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst uint32_t spill_id = spill->operands[1].constantValue(); uint32_t spill_slot = slots[spill_id]; - Temp scratch_offset = ctx.program->scratch_offset; + Temp scratch_offset; + if (!ctx.program->scratch_offsets.empty()) + scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx]; unsigned offset; setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset); @@ -1307,7 +1315,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector>& ins uint32_t spill_id = reload->operands[0].constantValue(); uint32_t spill_slot = slots[spill_id]; - Temp scratch_offset = ctx.program->scratch_offset; + Temp scratch_offset; + if (!ctx.program->scratch_offsets.empty()) + scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx]; unsigned offset; setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset); @@ -1531,6 +1541,8 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) * we cannot reuse the current scratch_rsrc temp because its definition is unreachable */ if (block.linear_preds.empty()) ctx.scratch_rsrc = Temp(); + if (block.kind & block_kind_resume) + ++ctx.resume_idx; } std::vector>::iterator it;