aco/sched_ilp: reorder VINTRP

VINTRP(gfx6-gfx10.3) is mostly just VALU, but we treated it like memory
instructions as an afterthought. This had issues as VINTRP was never reordered
with itself, or other memory instructions. Reordering VINTRP in clauses
increases ILP. We don't really need collect_clause_dependencies for VINTRP
either, because they usually have the same dependencies already. That means
we can still form VINTRP clauses by preferring to select a VINTRP instruction
right after a previous one.

Foz-DB Navi21:
Totals from 34184 (43.16% of 79206) affected shaders:
Instrs: 18811270 -> 18812046 (+0.00%); split: -0.01%, +0.02%
CodeSize: 103627276 -> 103630056 (+0.00%); split: -0.01%, +0.01%
Latency: 188379364 -> 187936731 (-0.23%); split: -0.27%, +0.03%
InvThroughput: 42600163 -> 42590608 (-0.02%); split: -0.03%, +0.00%
VClause: 378960 -> 378912 (-0.01%); split: -0.02%, +0.00%
SClause: 727560 -> 720573 (-0.96%); split: -1.08%, +0.12%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Daniel Schürmann <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33111>
This commit is contained in:
Georg Lehmann 2025-01-24 08:42:00 +01:00 committed by Marge Bot
parent 45ccd6487f
commit df1de388a3
2 changed files with 14 additions and 7 deletions

View file

@ -80,7 +80,7 @@ struct SchedILPContext {
bool
can_reorder(const Instruction* const instr)
{
if (instr->isVALU())
if (instr->isVALU() || instr->isVINTRP())
return true;
if (!instr->isSALU() || instr->isSOPP())
return false;
@ -285,7 +285,7 @@ unsigned
get_latency(const Instruction* const instr)
{
/* Note, that these are not accurate latency estimations. */
if (instr->isVALU())
if (instr->isVALU() || instr->isVINTRP())
return 5;
if (instr->isSALU())
return 2;
@ -432,9 +432,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
* any cases that are actually a concern for clause formation are added as transitive
* dependencies. */
write_dep_mask &= ~ctx.non_reorder_mask;
/* Ignore RaW for VINTERP. */
if (instr->isVINTRP())
entry.dependency_mask &= ~ctx.non_reorder_mask;
ctx.potential_partial_clause = true;
} else if (ctx.last_non_reorderable != UINT8_MAX) {
ctx.potential_partial_clause = false;
@ -558,8 +555,14 @@ select_instruction_ilp(const SchedILPContext& ctx)
mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
}
/* VINTRP(gfx6-10.3) can be handled like alu, but switching between VINTRP and other
* alu has a cost. So if the previous instr was VINTRP, try to keep selecting VINTRP.
*/
bool prefer_vintrp = ctx.prev_info.instr && ctx.prev_info.instr->isVINTRP();
/* Select the instruction with highest priority of all candidates. */
unsigned idx = -1u;
bool idx_vintrp = false;
int32_t priority = INT32_MIN;
u_foreach_bit (i, mask) {
const InstrInfo& candidate = ctx.nodes[i];
@ -568,8 +571,12 @@ select_instruction_ilp(const SchedILPContext& ctx)
if (candidate.dependency_mask)
continue;
if (idx == -1u || candidate.priority > priority) {
bool is_vintrp = prefer_vintrp && candidate.instr->isVINTRP();
if (idx == -1u || (is_vintrp && !idx_vintrp) ||
(is_vintrp == idx_vintrp && candidate.priority > priority)) {
idx = i;
idx_vintrp = is_vintrp;
priority = candidate.priority;
}
}

View file

@ -378,8 +378,8 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
//>> BB1: