From 69bc4efa378657d3f1755efcf981718be02888dc Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 3 Nov 2025 14:07:28 +0000 Subject: [PATCH] aco/sched_ilp: improve scheduling with VMEM/DS->VALU WaW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This improves scheduling with one side of a divergent branch writing to a VGPR using VMEM/DS, and the other writing using VALU. At the merge block, it will properly consider that the VGPR was written by a VMEM/DS. fossil-db (navi31): Totals from 1224 (1.53% of 79825) affected shaders: Instrs: 5264815 -> 5267604 (+0.05%); split: -0.00%, +0.06% CodeSize: 27406404 -> 27422132 (+0.06%); split: -0.00%, +0.06% Latency: 48325204 -> 48293975 (-0.06%); split: -0.09%, +0.03% InvThroughput: 8923880 -> 8919191 (-0.05%); split: -0.07%, +0.02% fossil-db (navi21): Totals from 1267 (1.59% of 79825) affected shaders: Instrs: 4628583 -> 4629190 (+0.01%); split: -0.00%, +0.01% CodeSize: 24974672 -> 24977188 (+0.01%); split: -0.00%, +0.01% Latency: 45080476 -> 44998120 (-0.18%); split: -0.20%, +0.02% InvThroughput: 12288202 -> 12269634 (-0.15%); split: -0.16%, +0.01% Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_scheduler_ilp.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp index f2d7e2f50af..f7f7828330e 100644 --- a/src/amd/compiler/aco_scheduler_ilp.cpp +++ b/src/amd/compiler/aco_scheduler_ilp.cpp @@ -573,8 +573,12 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_ if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) { ctx.regs[reg].has_direct_dependency = false; if (!ctx.is_vopd) { + /* Do MAX2() so that the latencies from both predecessors of a merge block are considered. 
*/ + if (BITSET_TEST(ctx.reg_has_latency, reg)) + ctx.regs[reg].latency = MAX2(ctx.regs[reg].latency, latency); + else + ctx.regs[reg].latency = latency; BITSET_SET(ctx.reg_has_latency, reg); - ctx.regs[reg].latency = latency; } } }