From 0c590eb90328b39e38b742125fe14ee1f56c79da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Tue, 5 Aug 2025 15:37:10 +0200 Subject: [PATCH] aco/scheduler: schedule VMEM store clauses during the regular forward pass Totals from 1456 (1.82% of 79839) affected shaders: (Navi48) MaxWaves: 37780 -> 37128 (-1.73%); split: +0.15%, -1.87% Instrs: 3788175 -> 3788435 (+0.01%); split: -0.04%, +0.04% CodeSize: 20468648 -> 20467432 (-0.01%); split: -0.04%, +0.03% VGPRs: 86820 -> 91440 (+5.32%); split: -0.10%, +5.42% Latency: 26866232 -> 26858867 (-0.03%); split: -0.04%, +0.01% InvThroughput: 3491741 -> 3828339 (+9.64%); split: -0.02%, +9.66% VClause: 90413 -> 89426 (-1.09%); split: -1.27%, +0.18% SClause: 130532 -> 130530 (-0.00%); split: -0.00%, +0.00% Copies: 347397 -> 347806 (+0.12%); split: -0.11%, +0.23% Branches: 117476 -> 117496 (+0.02%) VALU: 1897427 -> 1897830 (+0.02%); split: -0.02%, +0.04% SALU: 602365 -> 602379 (+0.00%) VOPD: 1259 -> 1251 (-0.64%); split: +0.24%, -0.87% Part-of: --- src/amd/compiler/aco_scheduler.cpp | 47 ++++++++++++++---------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index da3f0cacd81..f3e2ec628ac 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -113,6 +113,7 @@ struct sched_ctx { int16_t occupancy_factor; int16_t last_SMEM_stall; int last_SMEM_dep_idx; + int last_VMEM_store_idx; MoveState mv; bool schedule_pos_exports = true; unsigned schedule_pos_export_div = 1; @@ -1178,9 +1179,15 @@ schedule_position_export(sched_ctx& ctx, Block* block, Instruction* current, int } } -unsigned +void schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx) { + int max_distance = ctx.last_VMEM_store_idx + VMEM_STORE_CLAUSE_MAX_GRAB_DIST; + ctx.last_VMEM_store_idx = idx; + + if (max_distance < idx) + return; + hazard_query hq; init_hazard_query(ctx, &hq); @@ -1191,31 
+1198,30 @@ schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx) if (candidate->opcode == aco_opcode::p_logical_start) break; - if (!should_form_clause(current, candidate.get())) { - add_to_hazard_query(&hq, candidate.get()); - ctx.mv.downwards_skip(cursor); - k += get_likely_cost(candidate.get()); - continue; + if (should_form_clause(current, candidate.get())) { + if (perform_hazard_query(&hq, candidate.get(), false) == hazard_success) + ctx.mv.downwards_move_clause(cursor); + break; } - if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success) + if (candidate->isVMEM() || candidate->isFlatLike()) break; - if (ctx.mv.downwards_move_clause(cursor) != move_success) - break; - } - return cursor.insert_idx - cursor.insert_idx_clause - 1; + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(cursor); + k += get_likely_cost(candidate.get()); + } } void schedule_block(sched_ctx& ctx, Program* program, Block* block) { ctx.last_SMEM_dep_idx = 0; + ctx.last_VMEM_store_idx = INT_MAX; ctx.last_SMEM_stall = INT16_MIN; ctx.mv.block = block; /* go through all instructions and find memory loads */ - unsigned num_stores = 0; for (unsigned idx = 0; idx < block->instructions.size(); idx++) { Instruction* current = block->instructions[idx].get(); @@ -1231,7 +1237,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block) } if (current->definitions.empty()) { - num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0; + if ((current->isVMEM() || current->isFlatLike()) && program->gfx_level >= GFX11) { + ctx.mv.current = current; + schedule_VMEM_store(ctx, block, current, idx); + } continue; } @@ -1251,18 +1260,6 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block) } } - /* GFX11 benefits from creating VMEM store clauses. 
*/ - if (num_stores > 1 && program->gfx_level >= GFX11) { - for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { - Instruction* current = block->instructions[idx].get(); - if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike())) - continue; - - ctx.mv.current = current; - idx -= schedule_VMEM_store(ctx, block, current, idx); - } - } - /* resummarize the block's register demand */ block->register_demand = block->live_in_demand; for (const aco_ptr<Instruction>& instr : block->instructions)