aco/gfx11: schedule for VMEM store clauses

fossil-db (gfx1100): Totals from 49486 (37.09% of 133428) affected shaders: Instrs: 18376819 -> 18480712 (+0.57%); split: -0.00%, +0.57% CodeSize: 91810836 -> 92227292 (+0.45%); split: -0.00%, +0.45% VGPRs: 2031824 -> 2047784 (+0.79%); split: -0.02%, +0.81% Latency: 104259318 -> 103804792 (-0.44%); split: -0.44%, +0.00% InvThroughput: 16388760 -> 16399819 (+0.07%); split: -0.13%, +0.19% VClause: 568844 -> 432401 (-23.99%) Copies: 1197942 -> 1231202 (+2.78%); split: -0.08%, +2.86% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23505>
2026-01-06 15:20:17 +01:00 · 2023-06-07 17:08:10 +01:00 · 2023-06-07 17:08:10 +01:00 · f1f01aaef5
commit f1f01aaef5
parent f837fec213
1 changed files with 48 additions and 1 deletions
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@ -1019,6 +1019,37 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
   }
 }

+unsigned
+schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
+                    Instruction* current, int idx)
+{
+   hazard_query hq;
+   init_hazard_query(ctx, &hq);
+
+   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
+   unsigned skip = 0;
+
+   for (int i = 0; i < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
+      aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
+      if (candidate->opcode == aco_opcode::p_logical_start)
+         break;
+
+      if (!should_form_clause(current, candidate.get())) {
+         add_to_hazard_query(&hq, candidate.get());
+         ctx.mv.downwards_skip(cursor);
+         continue;
+      }
+
+      if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success ||
+          ctx.mv.downwards_move(cursor, true) != move_success)
+         break;
+
+      skip++;
+   }
+
+   return skip;
+}
+
 void
 schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
 {
@ -1028,6 +1059,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
+   unsigned num_stores = 0;
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

@ -1040,8 +1072,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
         }
      }

-      if (current->definitions.empty())
+      if (current->definitions.empty()) {
+         num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
         continue;
+      }

      if (current->isVMEM() || current->isFlatLike()) {
         ctx.mv.current = current;
@ -1054,6 +1088,19 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
      }
   }

+   /* GFX11 benefits from creating VMEM store clauses. */
+   if (num_stores > 1 && program->gfx_level >= GFX11) {
+      for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
+         Instruction* current = block->instructions[idx].get();
+         if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
+            continue;
+
+         ctx.mv.current = current;
+         idx -=
+            schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
+      }
+   }
+
   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {