aco/scheduler: schedule VMEM store clauses during the regular forward pass

Totals from 1456 (1.82% of 79839) affected shaders: (Navi48)

MaxWaves: 37780 -> 37128 (-1.73%); split: +0.15%, -1.87%
Instrs: 3788175 -> 3788435 (+0.01%); split: -0.04%, +0.04%
CodeSize: 20468648 -> 20467432 (-0.01%); split: -0.04%, +0.03%
VGPRs: 86820 -> 91440 (+5.32%); split: -0.10%, +5.42%
Latency: 26866232 -> 26858867 (-0.03%); split: -0.04%, +0.01%
InvThroughput: 3491741 -> 3828339 (+9.64%); split: -0.02%, +9.66%
VClause: 90413 -> 89426 (-1.09%); split: -1.27%, +0.18%
SClause: 130532 -> 130530 (-0.00%); split: -0.00%, +0.00%
Copies: 347397 -> 347806 (+0.12%); split: -0.11%, +0.23%
Branches: 117476 -> 117496 (+0.02%)
VALU: 1897427 -> 1897830 (+0.02%); split: -0.02%, +0.04%
SALU: 602365 -> 602379 (+0.00%)
VOPD: 1259 -> 1251 (-0.64%); split: +0.24%, -0.87%

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36599>
2026-05-06 13:48:06 +02:00 · 2025-08-05 15:37:10 +02:00 · 2025-08-05 15:37:10 +02:00 · 0c590eb903
commit 0c590eb903
parent f601eb8555
1 changed files with 22 additions and 25 deletions
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@ -113,6 +113,7 @@ struct sched_ctx {
   int16_t occupancy_factor;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
+   int last_VMEM_store_idx;
   MoveState mv;
   bool schedule_pos_exports = true;
   unsigned schedule_pos_export_div = 1;
@ -1178,9 +1179,15 @@ schedule_position_export(sched_ctx& ctx, Block* block, Instruction* current, int
   }
 }

-unsigned
+void
 schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx)
 {
+   int max_distance = ctx.last_VMEM_store_idx + VMEM_STORE_CLAUSE_MAX_GRAB_DIST;
+   ctx.last_VMEM_store_idx = idx;
+
+   if (max_distance < idx)
+      return;
+
   hazard_query hq;
   init_hazard_query(ctx, &hq);

@ -1191,31 +1198,30 @@ schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx)
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

-      if (!should_form_clause(current, candidate.get())) {
-         add_to_hazard_query(&hq, candidate.get());
-         ctx.mv.downwards_skip(cursor);
-         k += get_likely_cost(candidate.get());
-         continue;
+      if (should_form_clause(current, candidate.get())) {
+         if (perform_hazard_query(&hq, candidate.get(), false) == hazard_success)
+            ctx.mv.downwards_move_clause(cursor);
+         break;
      }

-      if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success)
+      if (candidate->isVMEM() || candidate->isFlatLike())
         break;
-      if (ctx.mv.downwards_move_clause(cursor) != move_success)
-         break;
-   }

-   return cursor.insert_idx - cursor.insert_idx_clause - 1;
+      add_to_hazard_query(&hq, candidate.get());
+      ctx.mv.downwards_skip(cursor);
+      k += get_likely_cost(candidate.get());
+   }
 }

 void
 schedule_block(sched_ctx& ctx, Program* program, Block* block)
 {
   ctx.last_SMEM_dep_idx = 0;
+   ctx.last_VMEM_store_idx = INT_MIN; /* no store yet; INT_MAX + grab dist would overflow */
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;

   /* go through all instructions and find memory loads */
-   unsigned num_stores = 0;
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

@ -1231,7 +1237,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block)
      }

      if (current->definitions.empty()) {
-         num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
+         if ((current->isVMEM() || current->isFlatLike()) && program->gfx_level >= GFX11) {
+            ctx.mv.current = current;
+            schedule_VMEM_store(ctx, block, current, idx);
+         }
         continue;
      }

@ -1251,18 +1260,6 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block)
      }
   }

-   /* GFX11 benefits from creating VMEM store clauses. */
-   if (num_stores > 1 && program->gfx_level >= GFX11) {
-      for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
-         Instruction* current = block->instructions[idx].get();
-         if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
-            continue;
-
-         ctx.mv.current = current;
-         idx -= schedule_VMEM_store(ctx, block, current, idx);
-      }
-   }
-
   /* resummarize the block's register demand */
   block->register_demand = block->live_in_demand;
   for (const aco_ptr<Instruction>& instr : block->instructions)