diff --git a/.pick_status.json b/.pick_status.json index 45c5b49616d..9e56eaaae24 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -574,7 +574,7 @@ "description": "aco: Make private_segment_buffer/scratch_offset per-resume", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7c8c3ecddb6..7ae302a78cf 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -7391,7 +7391,9 @@ Temp get_scratch_resource(isel_context* ctx) { Builder bld(ctx->program, ctx->block); - Temp scratch_addr = ctx->program->private_segment_buffer; + Temp scratch_addr; + if (!ctx->program->private_segment_buffers.empty()) + scratch_addr = ctx->program->private_segment_buffers.back(); if (!scratch_addr.bytes()) { Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); @@ -7449,7 +7451,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) } else { info.resource = get_scratch_resource(ctx); info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))); - info.soffset = ctx->program->scratch_offset; + info.soffset = ctx->program->scratch_offsets.back(); emit_load(ctx, bld, info, scratch_mubuf_load_params); } } @@ -7505,7 +7507,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) offset = as_vgpr(ctx, offset); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, + Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(), write_datas[i], offsets[i], true); mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD | @@ -10929,9 +10931,9 @@ add_startpgm(struct isel_context* ctx) * handling spilling. */ if (ctx->args->ring_offsets.used) - ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); + ctx->program->private_segment_buffers.push_back(get_arg(ctx, ctx->args->ring_offsets)); - ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset); + ctx->program->scratch_offsets.push_back(get_arg(ctx, ctx->args->scratch_offset)); } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) { /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog. */ diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 0a18d7445b3..eeceef0e5c9 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2130,8 +2130,9 @@ public: std::vector debug_info; std::vector constant_data; - Temp private_segment_buffer; - Temp scratch_offset; + /* Private segment buffers and scratch offsets. One entry per start/resume block */ + aco::small_vec private_segment_buffers; + aco::small_vec scratch_offsets; uint16_t num_waves = 0; uint16_t min_waves = 0; diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp index 7c30e5b5365..f06da735039 100644 --- a/src/amd/compiler/aco_reindex_ssa.cpp +++ b/src/amd/compiler/aco_reindex_ssa.cpp @@ -69,10 +69,14 @@ reindex_program(idx_ctx& ctx, Program* program) } /* update program members */ - program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()], - program->private_segment_buffer.regClass()); - program->scratch_offset = - Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); + for (auto& private_segment_buffer : program->private_segment_buffers) { + private_segment_buffer = + Temp(ctx.renames[private_segment_buffer.id()], private_segment_buffer.regClass()); + } + for (auto& scratch_offset : program->scratch_offsets) { + scratch_offset = + Temp(ctx.renames[scratch_offset.id()], scratch_offset.regClass()); + } program->temp_rc = ctx.temp_rc; } diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index ae7ae16e329..4cdb0cdeb14 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -88,13 +88,16 @@ struct spill_ctx { unsigned vgpr_spill_slots; Temp scratch_rsrc; + unsigned resume_idx; + spill_ctx(const RegisterDemand target_pressure_, Program* program_) : target_pressure(target_pressure_), program(program_), memory(), renames(program->blocks.size(), aco::map(memory)), spills_entry(program->blocks.size(), aco::unordered_map(memory)), spills_exit(program->blocks.size(), aco::unordered_map(memory)), processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()), - remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0) + remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0), + resume_idx(0) {} void add_affinity(uint32_t first, uint32_t second) @@ -1088,7 +1091,10 @@ spill_block(spill_ctx& ctx, unsigned block_idx) Temp load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) { - Temp private_segment_buffer = ctx.program->private_segment_buffer; + Temp private_segment_buffer; + if (!ctx.program->private_segment_buffers.empty()) + private_segment_buffer = ctx.program->private_segment_buffers[ctx.resume_idx]; + if (!private_segment_buffer.bytes()) { Temp addr_lo = bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); @@ -1109,7 +1115,7 @@ load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) Temp carry = bld.tmp(s1); addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, - ctx.program->scratch_offset); + ctx.program->scratch_offsets[ctx.resume_idx]); addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, Operand::c32(0), bld.scc(carry)); @@ -1218,7 +1224,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst uint32_t spill_id = spill->operands[1].constantValue(); uint32_t spill_slot = slots[spill_id]; - Temp scratch_offset = ctx.program->scratch_offset; + Temp scratch_offset; + if (!ctx.program->scratch_offsets.empty()) + scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx]; unsigned offset; setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset); @@ -1264,7 +1272,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector>& ins uint32_t spill_id = reload->operands[0].constantValue(); uint32_t spill_slot = slots[spill_id]; - Temp scratch_offset = ctx.program->scratch_offset; + Temp scratch_offset; + if (!ctx.program->scratch_offsets.empty()) + scratch_offset = ctx.program->scratch_offsets[ctx.resume_idx]; unsigned offset; setup_vgpr_spill_reload(ctx, block, instructions, spill_slot, scratch_offset, &offset); @@ -1488,6 +1498,8 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) * we cannot reuse the current scratch_rsrc temp because its definition is unreachable */ if (block.linear_preds.empty()) ctx.scratch_rsrc = Temp(); + if (block.kind & block_kind_resume) + ++ctx.resume_idx; } std::vector>::iterator it;