From 068f9b51de08aeab311e39aea973d59a76693c02 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sun, 26 Jan 2025 15:57:44 +0100 Subject: [PATCH] aco/sched_ilp: use more realistic memory latencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last commit changes the order of instructions more aggressively, and because the memory load latencies here are vastly underestimated, it ruins some of the work of pre-RA memory scheduling. With the new heuristic, large latency values work fine, so use them. Foz-DB GFX1150: Totals from 71343 (89.88% of 79377) affected shaders: Instrs: 41627671 -> 41915029 (+0.69%); split: -0.01%, +0.70% CodeSize: 215901308 -> 217051132 (+0.53%); split: -0.01%, +0.54% Latency: 288714439 -> 286556159 (-0.75%); split: -0.76%, +0.02% InvThroughput: 55834139 -> 55645301 (-0.34%); split: -0.35%, +0.01% VClause: 829066 -> 828984 (-0.01%); split: -0.04%, +0.03% SClause: 1237366 -> 1237448 (+0.01%); split: -0.02%, +0.02% VALU: 23643291 -> 23643292 (+0.00%); split: -0.00%, +0.00% Foz-DB Navi31: Totals from 70576 (88.91% of 79377) affected shaders: Instrs: 40928125 -> 41211820 (+0.69%); split: -0.01%, +0.70% CodeSize: 215770956 -> 216897948 (+0.52%); split: -0.00%, +0.53% Latency: 288139802 -> 286038405 (-0.73%); split: -0.75%, +0.02% InvThroughput: 46391629 -> 46300275 (-0.20%); split: -0.20%, +0.01% VClause: 829987 -> 829997 (+0.00%); split: -0.02%, +0.02% SClause: 1229345 -> 1229425 (+0.01%); split: -0.02%, +0.02% VALU: 24515334 -> 24515335 (+0.00%) Foz-DB Navi21: Instrs: 45512672 -> 45527322 (+0.03%); split: -0.01%, +0.04% CodeSize: 244254716 -> 244311472 (+0.02%); split: -0.01%, +0.03% Latency: 314034443 -> 311473726 (-0.82%); split: -0.83%, +0.01% InvThroughput: 73373201 -> 73220438 (-0.21%); split: -0.21%, +0.00% VClause: 914819 -> 914853 (+0.00%); split: -0.02%, +0.02% SClause: 1283331 -> 1283302 (-0.00%); split: -0.01%, +0.01% Foz-DB Vega10: Totals from 41908 (66.49% of 63026) affected shaders: Instrs: 
22770415 -> 22779136 (+0.04%); split: -0.01%, +0.04% CodeSize: 118195752 -> 118230540 (+0.03%); split: -0.00%, +0.03% Latency: 242119940 -> 239665380 (-1.01%); split: -1.02%, +0.01% InvThroughput: 131459884 -> 131182979 (-0.21%); split: -0.21%, +0.00% VClause: 493311 -> 493215 (-0.02%); split: -0.05%, +0.03% SClause: 758814 -> 758761 (-0.01%); split: -0.02%, +0.01% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_scheduler_ilp.cpp | 29 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp index 5e561634543..a064713fecf 100644 --- a/src/amd/compiler/aco_scheduler_ilp.cpp +++ b/src/amd/compiler/aco_scheduler_ilp.cpp @@ -49,10 +49,9 @@ struct InstrInfo { struct RegisterInfo { mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */ - int8_t latency; /* estimated outstanding latency of last register write outside the DAG. */ - uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */ - uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */ - uint8_t padding : 3; + uint16_t latency : 11; /* estimated outstanding latency of last register write outside the DAG. */ + uint16_t direct_dependency : 4; /* node that has to be scheduled before any other access. */ + uint16_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */ }; struct SchedILPContext { @@ -292,12 +291,22 @@ get_latency(const Instruction* const instr) return 5; if (instr->isSALU()) return 2; + /* Based on get_wait_counter_info in aco_statistics.cpp. 
*/ if (instr->isVMEM() || instr->isFlatLike()) - return 32; - if (instr->isSMEM()) - return 5; - if (instr->accessesLDS()) - return 2; + return 320; + if (instr->isSMEM()) { + if (instr->operands.empty()) + return 1; + if (instr->operands[0].size() == 2 || + (instr->operands[1].isConstant() && + (instr->operands.size() < 3 || instr->operands[2].isConstant()))) + return 30; + return 200; + } + if (instr->isLDSDIR()) + return 13; + if (instr->isDS()) + return 20; return 0; } @@ -480,7 +489,7 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_ ctx.regs[flat_scr_hi].read_mask &= mask; } - const int8_t latency = get_latency(instr); + const int latency = get_latency(instr); for (const Definition& def : instr->definitions) { for (unsigned i = 0; i < def.size(); i++) {