From 7c53e5748b0c93d8dfd6ecc01efb6e345367df2b Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 10 Jan 2023 15:29:15 +0000 Subject: [PATCH] aco: end reduce tmp after control flow, when used within control flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case of: v0 = start_linear_vgpr if (...) { } else { use_linear_vgpr(v0) } v0 = phi We need a p_end_linear_vgpr to ensure that the phi does not use the same VGPR as the linear VGPR. fossil-db (gfx1100): Totals from 3763 (2.80% of 134574) affected shaders: MaxWaves: 90296 -> 90164 (-0.15%) Instrs: 6857726 -> 6856608 (-0.02%); split: -0.03%, +0.01% CodeSize: 35382188 -> 35377688 (-0.01%); split: -0.02%, +0.01% VGPRs: 234864 -> 235692 (+0.35%); split: -0.01%, +0.36% Latency: 47471923 -> 47474965 (+0.01%); split: -0.03%, +0.04% InvThroughput: 5640320 -> 5639736 (-0.01%); split: -0.04%, +0.03% VClause: 93098 -> 93107 (+0.01%); split: -0.01%, +0.02% SClause: 214137 -> 214130 (-0.00%); split: -0.00%, +0.00% Copies: 369895 -> 369305 (-0.16%); split: -0.31%, +0.15% Branches: 164996 -> 164504 (-0.30%); split: -0.30%, +0.00% PreVGPRs: 210655 -> 211438 (+0.37%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Cc: mesa-stable Part-of: (cherry picked from commit 44fdd2ebcb271011665dd100ba9ef6852cddb22e) --- .pick_status.json | 2 +- src/amd/compiler/aco_reduce_assign.cpp | 45 ++++++++++++++------------ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index f5f4e7f4e52..67f021f1387 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -265,7 +265,7 @@ "description": "aco: end reduce tmp after control flow, when used within control flow", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null }, diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 82ecd53f625..5f902c5e2a6 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -64,31 +64,36 @@ setup_reduce_temp(Program* program) Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear()); int inserted_at = -1; int vtmp_inserted_at = -1; - bool reduceTmp_in_loop = false; bool vtmp_in_loop = false; for (Block& block : program->blocks) { - /* insert p_end_linear_vgpr after the outermost loop */ - if (reduceTmp_in_loop && block.loop_nest_depth == 0) { - assert(inserted_at == (int)last_top_level_block_idx); - - aco_ptr end{create_instruction( - aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)}; - end->operands[0] = Operand(reduceTmp); - if (vtmp_in_loop) - end->operands[1] = Operand(vtmp); - /* insert after the phis of the loop exit block */ - std::vector>::iterator it = block.instructions.begin(); - while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) - ++it; - block.instructions.insert(it, std::move(end)); - reduceTmp_in_loop = false; - } - - if (block.kind & block_kind_top_level) + if (block.kind & block_kind_top_level) { last_top_level_block_idx = block.index; + /* TODO: this could be improved in this case: + * start_linear_vgpr + * if (...) { + * use_linear_vgpr + * } + * end_linear_vgpr + * Here, the linear vgpr is used before any phi copies, so this isn't necessary. + */ + if (inserted_at >= 0) { + aco_ptr end{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)}; + end->operands[0] = Operand(reduceTmp); + if (vtmp_inserted_at >= 0) + end->operands[1] = Operand(vtmp); + /* insert after the phis of the block */ + std::vector>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(end)); + inserted_at = vtmp_inserted_at = -1; + } + } + if (!hasReductions[block.index]) continue; @@ -100,8 +105,6 @@ setup_reduce_temp(Program* program) instr->opcode != aco_opcode::p_bpermute_gfx11w64) continue; - reduceTmp_in_loop |= block.loop_nest_depth > 0; - if ((int)last_top_level_block_idx != inserted_at) { reduceTmp = program->allocateTmp(reduceTmp.regClass()); aco_ptr create{create_instruction(