aco/scheduler: schedule VMEM store clauses during the regular forward pass

Totals from 1456 (1.82% of 79839) affected shaders: (Navi48)

MaxWaves: 37780 -> 37128 (-1.73%); split: +0.15%, -1.87%
Instrs: 3788175 -> 3788435 (+0.01%); split: -0.04%, +0.04%
CodeSize: 20468648 -> 20467432 (-0.01%); split: -0.04%, +0.03%
VGPRs: 86820 -> 91440 (+5.32%); split: -0.10%, +5.42%
Latency: 26866232 -> 26858867 (-0.03%); split: -0.04%, +0.01%
InvThroughput: 3491741 -> 3828339 (+9.64%); split: -0.02%, +9.66%
VClause: 90413 -> 89426 (-1.09%); split: -1.27%, +0.18%
SClause: 130532 -> 130530 (-0.00%); split: -0.00%, +0.00%
Copies: 347397 -> 347806 (+0.12%); split: -0.11%, +0.23%
Branches: 117476 -> 117496 (+0.02%)
VALU: 1897427 -> 1897830 (+0.02%); split: -0.02%, +0.04%
SALU: 602365 -> 602379 (+0.00%)
VOPD: 1259 -> 1251 (-0.64%); split: +0.24%, -0.87%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36599>
This commit is contained in:
Daniel Schürmann 2025-08-05 15:37:10 +02:00 committed by Marge Bot
parent f601eb8555
commit 0c590eb903

View file

@@ -113,6 +113,7 @@ struct sched_ctx {
int16_t occupancy_factor;
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
int last_VMEM_store_idx;
MoveState mv;
bool schedule_pos_exports = true;
unsigned schedule_pos_export_div = 1;
@@ -1178,9 +1179,15 @@ schedule_position_export(sched_ctx& ctx, Block* block, Instruction* current, int
}
}
unsigned
void
schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx)
{
int max_distance = ctx.last_VMEM_store_idx + VMEM_STORE_CLAUSE_MAX_GRAB_DIST;
ctx.last_VMEM_store_idx = idx;
if (max_distance < idx)
return;
hazard_query hq;
init_hazard_query(ctx, &hq);
@@ -1191,31 +1198,30 @@ schedule_VMEM_store(sched_ctx& ctx, Block* block, Instruction* current, int idx)
if (candidate->opcode == aco_opcode::p_logical_start)
break;
if (!should_form_clause(current, candidate.get())) {
add_to_hazard_query(&hq, candidate.get());
ctx.mv.downwards_skip(cursor);
k += get_likely_cost(candidate.get());
continue;
if (should_form_clause(current, candidate.get())) {
if (perform_hazard_query(&hq, candidate.get(), false) == hazard_success)
ctx.mv.downwards_move_clause(cursor);
break;
}
if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success)
if (candidate->isVMEM() || candidate->isFlatLike())
break;
if (ctx.mv.downwards_move_clause(cursor) != move_success)
break;
}
return cursor.insert_idx - cursor.insert_idx_clause - 1;
add_to_hazard_query(&hq, candidate.get());
ctx.mv.downwards_skip(cursor);
k += get_likely_cost(candidate.get());
}
}
void
schedule_block(sched_ctx& ctx, Program* program, Block* block)
{
ctx.last_SMEM_dep_idx = 0;
ctx.last_VMEM_store_idx = INT_MAX;
ctx.last_SMEM_stall = INT16_MIN;
ctx.mv.block = block;
/* go through all instructions and find memory loads */
unsigned num_stores = 0;
for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
Instruction* current = block->instructions[idx].get();
@@ -1231,7 +1237,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block)
}
if (current->definitions.empty()) {
num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
if ((current->isVMEM() || current->isFlatLike()) && program->gfx_level >= GFX11) {
ctx.mv.current = current;
schedule_VMEM_store(ctx, block, current, idx);
}
continue;
}
@@ -1251,18 +1260,6 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block)
}
}
/* GFX11 benefits from creating VMEM store clauses. */
if (num_stores > 1 && program->gfx_level >= GFX11) {
for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
Instruction* current = block->instructions[idx].get();
if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
continue;
ctx.mv.current = current;
idx -= schedule_VMEM_store(ctx, block, current, idx);
}
}
/* resummarize the block's register demand */
block->register_demand = block->live_in_demand;
for (const aco_ptr<Instruction>& instr : block->instructions)