From 61854009f31ae44e7426649738f5e68ac459df02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Thu, 4 Jan 2024 15:50:10 +0100 Subject: [PATCH] aco: rematerialize constants in every basic block during optimizer Totals from 16837 (21.25% of 79242) affected shaders: (GFX11) MaxWaves: 441634 -> 444546 (+0.66%); split: +0.66%, -0.00% Instrs: 25908303 -> 25838469 (-0.27%); split: -0.36%, +0.09% CodeSize: 133943168 -> 135446948 (+1.12%); split: -0.04%, +1.16% VGPRs: 985332 -> 977440 (-0.80%); split: -0.83%, +0.03% SpillSGPRs: 9133 -> 7535 (-17.50%); split: -17.74%, +0.24% SpillVGPRs: 1418 -> 1359 (-4.16%); split: -4.58%, +0.42% Scratch: 5047552 -> 5040640 (-0.14%) Latency: 204330340 -> 204179212 (-0.07%); split: -0.32%, +0.25% InvThroughput: 36584220 -> 36508856 (-0.21%); split: -0.40%, +0.19% VClause: 437847 -> 437344 (-0.11%); split: -0.34%, +0.22% SClause: 771311 -> 771013 (-0.04%); split: -0.42%, +0.38% Copies: 1774950 -> 1712070 (-3.54%); split: -4.46%, +0.91% Branches: 580595 -> 580478 (-0.02%); split: -0.03%, +0.01% PreSGPRs: 877017 -> 817549 (-6.78%) PreVGPRs: 852747 -> 846966 (-0.68%); split: -0.68%, +0.00% Part-of: --- src/amd/compiler/aco_optimizer.cpp | 100 +++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index dc7e003b6ab..d4b52e85402 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -521,7 +521,7 @@ struct opt_ctx { Program* program; float_mode fp_mode; std::vector<aco_ptr<Instruction>> instructions; - ssa_info* info; + std::vector<ssa_info> info; std::pair last_literal; std::vector mad_infos; std::vector uses; @@ -3116,6 +3116,7 @@ combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op * uses properly initialized to 0. 
*/ ctx.uses.push_back(0); + ctx.info.push_back(ssa_info{}); } new_instr->operands[0] = Operand::zero(); new_instr->operands[1] = instr->operands[!i]; @@ -4616,6 +4617,91 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) } } +struct remat_entry { + Instruction* instr; + uint32_t block; +}; + +inline bool +is_constant(Instruction* instr) +{ + if (instr->opcode != aco_opcode::p_parallelcopy || instr->operands.size() != 1) + return false; + + return instr->operands[0].isConstant() && instr->definitions[0].isTemp(); +} + +void +remat_constants_instr(opt_ctx& ctx, aco::map<Temp, remat_entry>& constants, Instruction* instr, + uint32_t block_idx) +{ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + + auto it = constants.find(op.getTemp()); + if (it == constants.end()) + continue; + + /* Check if we already emitted the same constant in this block. */ + if (it->second.block != block_idx) { + /* Rematerialize the constant. */ + Builder bld(ctx.program, &ctx.instructions); + Operand const_op = it->second.instr->operands[0]; + it->second.instr = bld.copy(bld.def(op.regClass()), const_op); + it->second.block = block_idx; + ctx.uses.push_back(0); + ctx.info.push_back(ctx.info[op.tempId()]); + } + + /* Use the rematerialized constant and update information about latest use. */ + if (op.getTemp() != it->second.instr->definitions[0].getTemp()) { + ctx.uses[op.tempId()]--; + op.setTemp(it->second.instr->definitions[0].getTemp()); + ctx.uses[op.tempId()]++; + } + } +} + +/** + * This pass implements a simple constant rematerialization. + * As common subexpression elimination (CSE) might increase the live-ranges + * of loaded constants over large distances, this pass splits the live-ranges + * again by re-emitting constants in every basic block. 
+ */ +void +rematerialize_constants(opt_ctx& ctx) +{ + aco::monotonic_buffer_resource memory(1024); + aco::map<Temp, remat_entry> constants(memory); + + for (Block& block : ctx.program->blocks) { + if (block.logical_idom == -1) + continue; + + if (block.logical_idom == (int)block.index) + constants.clear(); + + ctx.instructions.reserve(block.instructions.size()); + + for (aco_ptr<Instruction>& instr : block.instructions) { + if (is_dead(ctx.uses, instr.get())) + continue; + + if (is_constant(instr.get())) { + Temp tmp = instr->definitions[0].getTemp(); + constants[tmp] = {instr.get(), block.index}; + } else if (!is_phi(instr)) { + remat_constants_instr(ctx, constants, instr.get(), block.index); + } + + ctx.instructions.emplace_back(instr.release()); + } + + block.instructions = std::move(ctx.instructions); + } +} + bool to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr) { @@ -5301,8 +5387,7 @@ optimize(Program* program) { opt_ctx ctx; ctx.program = program; - std::vector<ssa_info> info(program->peekAllocationId()); - ctx.info = info.data(); + ctx.info = std::vector<ssa_info>(program->peekAllocationId()); /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */ for (Block& block : program->blocks) { @@ -5313,14 +5398,17 @@ optimize(Program* program) ctx.uses = dead_code_analysis(program); - /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ + /* 2. Rematerialize constants in every block. */ + rematerialize_constants(ctx); + + /* 3. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ for (Block& block : program->blocks) { ctx.fp_mode = block.fp_mode; for (aco_ptr<Instruction>& instr : block.instructions) combine_instruction(ctx, instr); } - /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */ + /* 4. 
Top-Down DAG pass (backward) to select instructions (includes DCE) */ for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); ++block_rit) { Block* block = &(*block_rit); @@ -5330,7 +5418,7 @@ optimize(Program* program) select_instruction(ctx, *instr_rit); } - /* 4. Add literals to instructions */ + /* 5. Add literals to instructions */ for (Block& block : program->blocks) { ctx.instructions.reserve(block.instructions.size()); ctx.fp_mode = block.fp_mode;