aco/assembler: align loops if it reduces the number of cache lines

This is especially beneficial on GFX6-9. Totals from 11229 (8.46% of 132726) affected shaders: GFX11 CodeSize: 109608640 -> 109840916 (+0.21%) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23748>
2026-05-08 22:08:26 +02:00 · 2023-06-20 19:03:35 +02:00 · 2023-06-20 19:03:35 +02:00 · b9c5b273b0
commit b9c5b273b0
parent de8ecc127e
1 changed files with 30 additions and 0 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -48,6 +48,7 @@ struct asm_context {
   std::map<unsigned, constaddr_info> constaddrs;
   std::map<unsigned, constaddr_info> resumeaddrs;
   std::vector<struct aco_symbol>* symbols;
+   Block* loop_header;
   const int16_t* opcode;
   // TODO: keep track of branch instructions referring blocks
   // and, when emitting the block, correct the offset in instr
@ -1220,6 +1221,35 @@ fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
 void
 align_block(asm_context& ctx, std::vector<uint32_t>& code, Block& block)
 {
+   if (block.kind & block_kind_loop_exit && ctx.loop_header) {
+      Block* loop_header = ctx.loop_header;
+      ctx.loop_header = NULL;
+      std::vector<uint32_t> nops;
+
+      const unsigned loop_num_cl = DIV_ROUND_UP(block.offset - loop_header->offset, 16);
+      const unsigned loop_start_cl = loop_header->offset >> 4;
+      const unsigned loop_end_cl = (block.offset - 1) >> 4;
+
+      /* Align the loop if it fits into a single cache line or if we can
+       * reduce the number of cache lines with less than 8 NOPs.
+       */
+      const bool align_loop = loop_end_cl - loop_start_cl >= loop_num_cl &&
+                              (loop_num_cl == 1 || loop_header->offset % 16 > 8);
+
+      if (align_loop) {
+         nops.resize(16 - (loop_header->offset % 16), 0xbf800000u);
+         insert_code(ctx, code, loop_header->offset, nops.size(), nops.data());
+      }
+   }
+
+   if (block.kind & block_kind_loop_header) {
+      /* In case of nested loops, only handle the inner-most loops in order
+       * to not break the alignment of inner loops by handling outer loops.
+       * Also ignore loops without back-edge.
+       */
+      ctx.loop_header = block.linear_preds.size() > 1 ? &block : NULL;
+   }
+
   /* align resume shaders with cache line */
   if (block.kind & block_kind_resume) {
      size_t cache_aligned = align(code.size(), 16);