aco/sched_ilp: new latency heuristic

The main idea is that latency should be accounted for only after the
write has been scheduled. As a result, good scheduling results depend
far less on the input order of the instructions.

Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%

Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%

Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%

Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
This commit is contained in:
Georg Lehmann 2025-01-26 13:41:53 +01:00 committed by Marge Bot
parent 0f13a42657
commit 819938d2fa
2 changed files with 58 additions and 39 deletions

View file

@@ -6,6 +6,7 @@
#include "aco_ir.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/macros.h"
#include <limits>
@@ -40,14 +41,15 @@ struct VOPDInfo {
struct InstrInfo {
Instruction* instr;
int32_t priority;
int16_t priority;
mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
mask_t write_for_read_mask; /* bitmask of nodes in the DAG that have a RaW dependency. */
uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
};
struct RegisterInfo {
mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
int8_t latency; /* estimated latency of last register write. */
int8_t latency; /* estimated outstanding latency of last register write outside the DAG. */
uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
uint8_t padding : 3;
@@ -58,6 +60,7 @@ struct SchedILPContext {
bool is_vopd = false;
InstrInfo nodes[num_nodes];
RegisterInfo regs[512];
BITSET_DECLARE(reg_has_latency, 512) = { 0 };
mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
@@ -318,6 +321,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
InstrInfo& entry = ctx.nodes[idx];
entry.instr = instr;
entry.priority = 0;
entry.write_for_read_mask = 0;
const mask_t mask = BITFIELD_BIT(idx);
bool reorder = can_reorder(instr);
ctx.active_mask |= mask;
@@ -346,28 +350,12 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Add register reads. */
reg_info.read_mask |= mask;
int cycles_since_reg_write = num_nodes;
if (reg_info.has_direct_dependency) {
/* A previous dependency is still part of the DAG. */
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
}
if (reg_info.latency) {
/* Ignore and reset register latencies for memory loads and other non-reorderable
* instructions. We schedule these as early as possible anyways.
*/
if (reorder && reg_info.latency > cycles_since_reg_write) {
entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
/* If a previous register write created some latency, ensure that this
* is the first read of the register by making this instruction a direct
* dependency of all following register reads.
*/
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
}
reg_info.latency = 0;
} else if (BITSET_TEST(ctx.reg_has_latency, reg + i)) {
entry.priority = MIN2(entry.priority, -reg_info.latency);
}
}
}
@@ -375,15 +363,19 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Check if this instructions reads implicit registers. */
if (needs_exec_mask(instr)) {
for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
if (ctx.regs[reg].has_direct_dependency) {
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
}
ctx.regs[reg].read_mask |= mask;
}
}
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
if (ctx.regs[reg].has_direct_dependency) {
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
}
ctx.regs[reg].read_mask |= mask;
}
}
@@ -400,11 +392,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* This register write is a direct dependency for all following reads. */
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
if (!ctx.is_vopd) {
/* Add latency information for the next register read. */
reg_info.latency = get_latency(instr);
}
}
}
@@ -447,9 +434,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Add transitive dependencies. */
if (entry.dependency_mask & BITFIELD_BIT(i))
entry.dependency_mask |= ctx.nodes[i].dependency_mask;
/* increment base priority */
ctx.nodes[i].priority++;
}
}
@@ -459,6 +443,24 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
const mask_t mask = ~BITFIELD_BIT(idx);
ctx.active_mask &= mask;
int stall = 1; /* Assume all instructions take one cycle to issue. */
if (ctx.nodes[idx].priority < 0) {
/* Add remaining latency stall. */
stall -= ctx.nodes[idx].priority;
}
if (!ctx.is_vopd) {
unsigned i;
BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
if (ctx.regs[i].latency <= stall) {
ctx.regs[i].latency = 0;
BITSET_CLEAR(ctx.reg_has_latency, i);
} else {
ctx.regs[i].latency -= stall;
}
}
}
for (const Operand& op : instr->operands) {
const unsigned reg = op.physReg();
if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
@@ -467,7 +469,6 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
for (unsigned i = 0; i < op.size(); i++) {
RegisterInfo& reg_info = ctx.regs[reg + i];
reg_info.read_mask &= mask;
reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
}
}
if (needs_exec_mask(instr)) {
@@ -478,16 +479,30 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
ctx.regs[flat_scr_lo].read_mask &= mask;
ctx.regs[flat_scr_hi].read_mask &= mask;
}
const int8_t latency = get_latency(instr);
for (const Definition& def : instr->definitions) {
for (unsigned i = 0; i < def.size(); i++) {
unsigned reg = def.physReg().reg() + i;
ctx.regs[reg].read_mask &= mask;
ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) {
ctx.regs[reg].has_direct_dependency = false;
if (!ctx.is_vopd) {
BITSET_SET(ctx.reg_has_latency, reg);
ctx.regs[reg].latency = latency;
}
}
}
}
for (unsigned i = 0; i < num_nodes; i++)
for (unsigned i = 0; i < num_nodes; i++) {
ctx.nodes[i].dependency_mask &= mask;
ctx.nodes[i].priority += stall;
if (ctx.nodes[idx].write_for_read_mask & BITFIELD_BIT(i) && !ctx.is_vopd) {
ctx.nodes[i].priority = MIN2(ctx.nodes[i].priority, -latency);
}
}
if (ctx.next_non_reorderable == idx) {
ctx.non_reorder_mask &= mask;
@@ -782,10 +797,14 @@ schedule_ilp(Program* program)
SchedILPContext ctx = {program};
for (Block& block : program->blocks) {
if (block.instructions.empty())
continue;
auto it = block.instructions.begin();
auto insert_it = block.instructions.begin();
do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
block.instructions.resize(insert_it - block.instructions.begin());
if (block.linear_succs.empty() || block.instructions.back()->opcode == aco_opcode::s_branch)
BITSET_ZERO(ctx.reg_has_latency);
}
}

View file

@@ -83,8 +83,8 @@ BEGIN_TEST(d3d11_derivs.constant)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, -0.5 ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -334,8 +334,8 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//; success = rx+1 == ry
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf ; $_ $_
@@ -376,9 +376,9 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
@@ -472,11 +472,11 @@ BEGIN_TEST(d3d11_derivs.cube_array)
//>> v_cubeid_f32 v#rf, v#_, v#_, v#_ ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
//>> v_mov_b32_e32 v#rlf, v#rlf_tmp ; $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rlf