From df1de388a3232be7e70c3cd47651b0da4e0367d2 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Fri, 24 Jan 2025 08:42:00 +0100 Subject: [PATCH] aco/sched_ilp: reorder VINTRP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VINTRP(gfx6-gfx10.3) is mostly just VALU, but we treated it like memory instructions as an afterthought. This had issues as VINTRP was never reordered with itself, or other memory instructions. Reordering VINTRP in clauses increases ILP. We don't really need collect_clause_dependencies for VINTRP either, because they ususally have the same dependencies already. That means we can still form VINTRP clauses by selecting preferably VINTRP after a previous one. Foz-DB Navi21: Totals from 34184 (43.16% of 79206) affected shaders: Instrs: 18811270 -> 18812046 (+0.00%); split: -0.01%, +0.02% CodeSize: 103627276 -> 103630056 (+0.00%); split: -0.01%, +0.01% Latency: 188379364 -> 187936731 (-0.23%); split: -0.27%, +0.03% InvThroughput: 42600163 -> 42590608 (-0.02%); split: -0.03%, +0.00% VClause: 378960 -> 378912 (-0.01%); split: -0.02%, +0.00% SClause: 727560 -> 720573 (-0.96%); split: -1.08%, +0.12% Reviewed-by: Daniel Schürmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_scheduler_ilp.cpp | 19 +++++++++++++------ src/amd/compiler/tests/test_d3d11_derivs.cpp | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp index 1adfd43a47a..4589427d9a1 100644 --- a/src/amd/compiler/aco_scheduler_ilp.cpp +++ b/src/amd/compiler/aco_scheduler_ilp.cpp @@ -80,7 +80,7 @@ struct SchedILPContext { bool can_reorder(const Instruction* const instr) { - if (instr->isVALU()) + if (instr->isVALU() || instr->isVINTRP()) return true; if (!instr->isSALU() || instr->isSOPP()) return false; @@ -285,7 +285,7 @@ unsigned get_latency(const Instruction* const instr) { /* Note, that these are not accurate latency estimations. */ - if (instr->isVALU()) + if (instr->isVALU() || instr->isVINTRP()) return 5; if (instr->isSALU()) return 2; @@ -432,9 +432,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx) * any cases that are actually a concern for clause formation are added as transitive * dependencies. */ write_dep_mask &= ~ctx.non_reorder_mask; - /* Ignore RaW for VINTERP. */ - if (instr->isVINTRP()) - entry.dependency_mask &= ~ctx.non_reorder_mask; ctx.potential_partial_clause = true; } else if (ctx.last_non_reorderable != UINT8_MAX) { ctx.potential_partial_clause = false; @@ -558,8 +555,14 @@ select_instruction_ilp(const SchedILPContext& ctx) mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0); } + /* VINTRP(gfx6-10.3) can be handled like alu, but switching between VINTRP and other + * alu has a cost. So if the previous instr was VINTRP, try to keep selecting VINTRP. + */ + bool prefer_vintrp = ctx.prev_info.instr && ctx.prev_info.instr->isVINTRP(); + /* Select the instruction with highest priority of all candidates. */ unsigned idx = -1u; + bool idx_vintrp = false; int32_t priority = INT32_MIN; u_foreach_bit (i, mask) { const InstrInfo& candidate = ctx.nodes[i]; @@ -568,8 +571,12 @@ select_instruction_ilp(const SchedILPContext& ctx) if (candidate.dependency_mask) continue; - if (idx == -1u || candidate.priority > priority) { + bool is_vintrp = prefer_vintrp && candidate.instr->isVINTRP(); + + if (idx == -1u || (is_vintrp && !idx_vintrp) || + (is_vintrp == idx_vintrp && candidate.priority > priority)) { idx = i; + idx_vintrp = is_vintrp; priority = candidate.priority; } } diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp index 3906d7295e3..69a879da883 100644 --- a/src/amd/compiler/tests/test_d3d11_derivs.cpp +++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp @@ -378,8 +378,8 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9) //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_ //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_ - //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_ //>> v_mov_b32_e32 v#ry, 0.5 ; $_ + //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_ //>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_ //>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_ //>> BB1: