From b6a28aaa8bacebf0ff78f2d811a80d3e77a8ef3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 22 Feb 2021 14:58:37 +0100 Subject: [PATCH] aco/cssa: don't create parallelcopies for constants and exec if we are able to spill these directly. Totals from 4913 (3.60% of 136546) affected shaders (Raven): SpillSGPRs: 16021 -> 15451 (-3.56%); split: -3.87%, +0.31% CodeSize: 58102020 -> 57371464 (-1.26%); split: -1.26%, +0.00% Instrs: 11411454 -> 11230105 (-1.59%); split: -1.59%, +0.00% Latency: 555706331 -> 550058635 (-1.02%); split: -1.07%, +0.05% InvThroughput: 273023354 -> 271854469 (-0.43%); split: -0.44%, +0.01% SClause: 385168 -> 385371 (+0.05%); split: -0.01%, +0.06% Copies: 1342084 -> 1175762 (-12.39%); split: -12.40%, +0.01% Branches: 392619 -> 378662 (-3.55%); split: -3.56%, +0.00% Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_lower_to_cssa.cpp | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp index 3613df1c8c3..e1eec37c20d 100644 --- a/src/amd/compiler/aco_lower_to_cssa.cpp +++ b/src/amd/compiler/aco_lower_to_cssa.cpp @@ -78,10 +78,17 @@ void collect_parallelcopies(cssa_ctx& ctx) phi->opcode != aco_opcode::p_linear_phi) break; + const Definition& def = phi->definitions[0]; + + /* if the definition is not temp, it is the exec mask. + * We can reload the exec mask directly from the spill slot. + */ + if (!def.isTemp()) + continue; + std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; - const Definition& def = phi->definitions[0]; uint32_t index = ctx.merge_sets.size(); merge_set set; @@ -91,6 +98,20 @@ void collect_parallelcopies(cssa_ctx& ctx) if (op.isUndefined()) continue; + if (def.regClass().type() == RegType::sgpr && !op.isTemp()) { + /* SGPR inline constants and literals on GFX10+ can be spilled + * and reloaded directly (without intermediate register) */ + if (op.isConstant()) { + if (ctx.program->chip_class >= GFX10) + continue; + if (op.size() == 1 && !op.isLiteral()) + continue; + } else { + assert(op.isFixed() && op.physReg() == exec); + continue; + } + } + /* create new temporary and rename operands */ Temp tmp = bld.tmp(def.regClass()); ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op}); @@ -107,6 +128,10 @@ void collect_parallelcopies(cssa_ctx& ctx) has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header; } + + if (set.empty()) + continue; + /* place the definition in dominance-order */ if (def.isTemp()) { if (has_preheader_copy)