mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-22 20:30:42 +02:00
aco/sched_ilp: new latency heuristic
The main train of thought is that we should consider latency after the write was scheduled. This means we rely a lot less on the input order of instructions for good results.

Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%

Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%

Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%

Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
This commit is contained in:
parent
0f13a42657
commit
819938d2fa
2 changed files with 58 additions and 39 deletions
|
|
@ -6,6 +6,7 @@
|
|||
#include "aco_ir.h"
|
||||
|
||||
#include "util/bitscan.h"
|
||||
#include "util/bitset.h"
|
||||
#include "util/macros.h"
|
||||
|
||||
#include <limits>
|
||||
|
|
@ -40,14 +41,15 @@ struct VOPDInfo {
|
|||
|
||||
struct InstrInfo {
|
||||
Instruction* instr;
|
||||
int32_t priority;
|
||||
int16_t priority;
|
||||
mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
|
||||
mask_t write_for_read_mask; /* bitmask of nodes in the DAG that have a RaW dependency. */
|
||||
uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
|
||||
};
|
||||
|
||||
struct RegisterInfo {
|
||||
mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
|
||||
int8_t latency; /* estimated latency of last register write. */
|
||||
int8_t latency; /* estimated outstanding latency of last register write outside the DAG. */
|
||||
uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
|
||||
uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
|
||||
uint8_t padding : 3;
|
||||
|
|
@ -58,6 +60,7 @@ struct SchedILPContext {
|
|||
bool is_vopd = false;
|
||||
InstrInfo nodes[num_nodes];
|
||||
RegisterInfo regs[512];
|
||||
BITSET_DECLARE(reg_has_latency, 512) = { 0 };
|
||||
mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
|
||||
mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
|
||||
uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
|
||||
|
|
@ -318,6 +321,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|||
InstrInfo& entry = ctx.nodes[idx];
|
||||
entry.instr = instr;
|
||||
entry.priority = 0;
|
||||
entry.write_for_read_mask = 0;
|
||||
const mask_t mask = BITFIELD_BIT(idx);
|
||||
bool reorder = can_reorder(instr);
|
||||
ctx.active_mask |= mask;
|
||||
|
|
@ -346,28 +350,12 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|||
/* Add register reads. */
|
||||
reg_info.read_mask |= mask;
|
||||
|
||||
int cycles_since_reg_write = num_nodes;
|
||||
if (reg_info.has_direct_dependency) {
|
||||
/* A previous dependency is still part of the DAG. */
|
||||
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
||||
entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
|
||||
cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
|
||||
}
|
||||
|
||||
if (reg_info.latency) {
|
||||
/* Ignore and reset register latencies for memory loads and other non-reorderable
|
||||
* instructions. We schedule these as early as possible anyways.
|
||||
*/
|
||||
if (reorder && reg_info.latency > cycles_since_reg_write) {
|
||||
entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
|
||||
|
||||
/* If a previous register write created some latency, ensure that this
|
||||
* is the first read of the register by making this instruction a direct
|
||||
* dependency of all following register reads.
|
||||
*/
|
||||
reg_info.has_direct_dependency = 1;
|
||||
reg_info.direct_dependency = idx;
|
||||
}
|
||||
reg_info.latency = 0;
|
||||
} else if (BITSET_TEST(ctx.reg_has_latency, reg + i)) {
|
||||
entry.priority = MIN2(entry.priority, -reg_info.latency);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -375,15 +363,19 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|||
/* Check if this instruction reads implicit registers. */
|
||||
if (needs_exec_mask(instr)) {
|
||||
for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
|
||||
if (ctx.regs[reg].has_direct_dependency)
|
||||
if (ctx.regs[reg].has_direct_dependency) {
|
||||
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
|
||||
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
||||
}
|
||||
ctx.regs[reg].read_mask |= mask;
|
||||
}
|
||||
}
|
||||
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
|
||||
for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
|
||||
if (ctx.regs[reg].has_direct_dependency)
|
||||
if (ctx.regs[reg].has_direct_dependency) {
|
||||
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
|
||||
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
|
||||
}
|
||||
ctx.regs[reg].read_mask |= mask;
|
||||
}
|
||||
}
|
||||
|
|
@ -400,11 +392,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|||
/* This register write is a direct dependency for all following reads. */
|
||||
reg_info.has_direct_dependency = 1;
|
||||
reg_info.direct_dependency = idx;
|
||||
|
||||
if (!ctx.is_vopd) {
|
||||
/* Add latency information for the next register read. */
|
||||
reg_info.latency = get_latency(instr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -447,9 +434,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
|
|||
/* Add transitive dependencies. */
|
||||
if (entry.dependency_mask & BITFIELD_BIT(i))
|
||||
entry.dependency_mask |= ctx.nodes[i].dependency_mask;
|
||||
|
||||
/* increment base priority */
|
||||
ctx.nodes[i].priority++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -459,6 +443,24 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
|
|||
const mask_t mask = ~BITFIELD_BIT(idx);
|
||||
ctx.active_mask &= mask;
|
||||
|
||||
int stall = 1; /* Assume all instructions take one cycle to issue. */
|
||||
if (ctx.nodes[idx].priority < 0) {
|
||||
/* Add remaining latency stall. */
|
||||
stall -= ctx.nodes[idx].priority;
|
||||
}
|
||||
|
||||
if (!ctx.is_vopd) {
|
||||
unsigned i;
|
||||
BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
|
||||
if (ctx.regs[i].latency <= stall) {
|
||||
ctx.regs[i].latency = 0;
|
||||
BITSET_CLEAR(ctx.reg_has_latency, i);
|
||||
} else {
|
||||
ctx.regs[i].latency -= stall;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const Operand& op : instr->operands) {
|
||||
const unsigned reg = op.physReg();
|
||||
if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
|
||||
|
|
@ -467,7 +469,6 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
|
|||
for (unsigned i = 0; i < op.size(); i++) {
|
||||
RegisterInfo& reg_info = ctx.regs[reg + i];
|
||||
reg_info.read_mask &= mask;
|
||||
reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
|
||||
}
|
||||
}
|
||||
if (needs_exec_mask(instr)) {
|
||||
|
|
@ -478,16 +479,30 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
|
|||
ctx.regs[flat_scr_lo].read_mask &= mask;
|
||||
ctx.regs[flat_scr_hi].read_mask &= mask;
|
||||
}
|
||||
|
||||
const int8_t latency = get_latency(instr);
|
||||
|
||||
for (const Definition& def : instr->definitions) {
|
||||
for (unsigned i = 0; i < def.size(); i++) {
|
||||
unsigned reg = def.physReg().reg() + i;
|
||||
ctx.regs[reg].read_mask &= mask;
|
||||
ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
|
||||
if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) {
|
||||
ctx.regs[reg].has_direct_dependency = false;
|
||||
if (!ctx.is_vopd) {
|
||||
BITSET_SET(ctx.reg_has_latency, reg);
|
||||
ctx.regs[reg].latency = latency;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < num_nodes; i++)
|
||||
for (unsigned i = 0; i < num_nodes; i++) {
|
||||
ctx.nodes[i].dependency_mask &= mask;
|
||||
ctx.nodes[i].priority += stall;
|
||||
if (ctx.nodes[idx].write_for_read_mask & BITFIELD_BIT(i) && !ctx.is_vopd) {
|
||||
ctx.nodes[i].priority = MIN2(ctx.nodes[i].priority, -latency);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx.next_non_reorderable == idx) {
|
||||
ctx.non_reorder_mask &= mask;
|
||||
|
|
@ -782,10 +797,14 @@ schedule_ilp(Program* program)
|
|||
SchedILPContext ctx = {program};
|
||||
|
||||
for (Block& block : program->blocks) {
|
||||
if (block.instructions.empty())
|
||||
continue;
|
||||
auto it = block.instructions.begin();
|
||||
auto insert_it = block.instructions.begin();
|
||||
do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
|
||||
block.instructions.resize(insert_it - block.instructions.begin());
|
||||
if (block.linear_succs.empty() || block.instructions.back()->opcode == aco_opcode::s_branch)
|
||||
BITSET_ZERO(ctx.reg_has_latency);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -83,8 +83,8 @@ BEGIN_TEST(d3d11_derivs.constant)
|
|||
//>> p_end_linear_vgpr (kill)%wqm
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
|
||||
|
||||
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
|
||||
//>> v_mov_b32_e32 v#ry, -0.5 ; $_
|
||||
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
|
||||
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
|
||||
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
|
||||
|
|
@ -334,8 +334,8 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
|
|||
//>> p_end_linear_vgpr (kill)%wqm
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
|
||||
|
||||
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
|
||||
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
|
||||
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
|
||||
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
|
||||
//; success = rx+1 == ry
|
||||
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf ; $_ $_
|
||||
|
|
@ -376,9 +376,9 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
|
|||
//>> p_end_linear_vgpr (kill)%wqm
|
||||
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
|
||||
|
||||
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
|
||||
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
|
||||
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
|
||||
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
|
||||
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
|
||||
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
|
||||
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
|
||||
|
|
@ -472,11 +472,11 @@ BEGIN_TEST(d3d11_derivs.cube_array)
|
|||
|
||||
//>> v_cubeid_f32 v#rf, v#_, v#_, v#_ ; $_ $_
|
||||
|
||||
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
|
||||
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
|
||||
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
|
||||
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
|
||||
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
|
||||
//>> v_mov_b32_e32 v#rlf, v#rlf_tmp ; $_
|
||||
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
|
||||
|
||||
//>> BB1:
|
||||
//; success = rx+1 == ry and rx+2 == rlf
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue