From b9c5b273b0a9b12c9f962e511dac801b3732f8fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Tue, 20 Jun 2023 19:03:35 +0200
Subject: [PATCH] aco/assembler: align loops if it reduces the number of cache
 lines

This is especially beneficial on GFX6-9.

Totals from 11229 (8.46% of 132726) affected shaders:
GFX11
CodeSize: 109608640 -> 109840916 (+0.21%)

Part-of:
---
 src/amd/compiler/aco_assembler.cpp | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index c4a341dab3f..0b3d30ac897 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -48,6 +48,7 @@ struct asm_context {
    std::map<unsigned, constaddr_info> constaddrs;
    std::map<unsigned, constaddr_info> resumeaddrs;
    std::vector<struct aco_symbol>* symbols;
+   Block* loop_header;
    const int16_t* opcode;
    // TODO: keep track of branch instructions referring blocks
    // and, when emitting the block, correct the offset in instr
@@ -1220,6 +1221,35 @@ fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
 void
 align_block(asm_context& ctx, std::vector<uint32_t>& code, Block& block)
 {
+   if (block.kind & block_kind_loop_exit && ctx.loop_header) {
+      Block* loop_header = ctx.loop_header;
+      ctx.loop_header = NULL;
+      std::vector<uint32_t> nops;
+
+      const unsigned loop_num_cl = DIV_ROUND_UP(block.offset - loop_header->offset, 16);
+      const unsigned loop_start_cl = loop_header->offset >> 4;
+      const unsigned loop_end_cl = (block.offset - 1) >> 4;
+
+      /* Align the loop if it fits into a single cache line or if we can
+       * reduce the number of cache lines with fewer than 8 NOPs.
+       */
+      const bool align_loop = loop_end_cl - loop_start_cl >= loop_num_cl &&
+                              (loop_num_cl == 1 || loop_header->offset % 16 > 8);
+
+      if (align_loop) {
+         nops.resize(16 - (loop_header->offset % 16), 0xbf800000u);
+         insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
+      }
+   }
+
+   if (block.kind & block_kind_loop_header) {
+      /* In case of nested loops, only handle the inner-most loops in order
+       * to not break the alignment of inner loops by handling outer loops.
+       * Also ignore loops without back-edge.
+       */
+      ctx.loop_header = block.linear_preds.size() > 1 ? &block : NULL;
+   }
+
    /* align resume shaders with cache line */
    if (block.kind & block_kind_resume) {
       size_t cache_aligned = align(code.size(), 16);