nir/load_store_vectorize: hoist base addr instead of subtracting

Totals from 3130 (3.92% of 79839) affected shaders: (Navi48)
Instrs: 2634316 -> 2633652 (-0.03%); split: -0.06%, +0.04%
CodeSize: 13999784 -> 13996888 (-0.02%); split: -0.05%, +0.03%
SpillSGPRs: 1771 -> 1778 (+0.40%)
Latency: 27233464 -> 27230934 (-0.01%); split: -0.02%, +0.01%
InvThroughput: 4234587 -> 4234550 (-0.00%); split: -0.00%, +0.00%
VClause: 54684 -> 54689 (+0.01%)
SClause: 62743 -> 62912 (+0.27%); split: -0.08%, +0.35%
Copies: 162594 -> 163729 (+0.70%); split: -0.22%, +0.91%
PreSGPRs: 146909 -> 146914 (+0.00%); split: -0.01%, +0.01%
VALU: 1558771 -> 1558778 (+0.00%)
SALU: 337715 -> 338168 (+0.13%); split: -0.30%, +0.44%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37163>
This commit is contained in:
Daniel Schürmann 2025-07-19 12:49:57 +02:00 committed by Marge Bot
parent cfba417316
commit a53190a426

View file

@ -892,6 +892,31 @@ subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
b, deref, nir_imm_intN_t(b, -offset, deref->def.bit_size));
}
/* Moves the instruction chain that computes a base address ("to_hoist" and,
 * recursively, its ALU sources) to just before "instr", so that the address
 * computation dominates the (earlier) vectorized load it gets attached to.
 *
 * Only ALU and load_const instructions are expected here: the vectorizer
 * guarantees that just the offset calculation differs between the loads
 * being merged.
 */
static void
hoist_base_addr(nir_instr *instr, nir_instr *to_hoist)
{
   /* Return if this instruction already dominates the first load. */
   if (to_hoist->block != instr->block || to_hoist->index <= instr->index)
      return;

   /* Only the offset calculation (consisting of ALU and load_const)
    * differs between the vectorized loads.
    */
   assert(to_hoist->type == nir_instr_type_load_const ||
          to_hoist->type == nir_instr_type_alu);

   if (to_hoist->type == nir_instr_type_alu) {
      /* For ALU, recursively hoist the sources first so that they still
       * dominate to_hoist after it is moved. */
      nir_alu_instr *alu = nir_instr_as_alu(to_hoist);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
         hoist_base_addr(instr, alu->src[i].src.ssa->parent_instr);
   }

   nir_instr_move(nir_before_instr(instr), to_hoist);
   /* Keep the index consistent so later dominance checks (the "<=" test
    * above) treat the hoisted instruction as dominating "instr". */
   to_hoist->index = instr->index;
}
static void
vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
struct entry *low, struct entry *high,
@ -958,21 +983,16 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
/* update the offset */
if (first != low && info->base_src >= 0) {
/* let nir_opt_algebraic() remove this addition. this doesn't have much
* issues with subtracting 16 from expressions like "(i + 1) * 16" because
* nir_opt_algebraic() turns them into "i * 16 + 16" */
b->cursor = nir_before_instr(first->instr);
/* Hoist low base addr before first intrinsic. */
nir_def *base = low->intrin->src[info->base_src].ssa;
hoist_base_addr(first->instr, base->parent_instr);
nir_src_rewrite(&first->intrin->src[info->base_src], base);
if (nir_intrinsic_has_offset_shift(first->intrin)) {
nir_add_io_offset(b, first->intrin, -(int)(high_start / 8u));
} else {
/* TODO once all intrinsics that need a scaled offset use
* offset_shift, this old path can be removed.
*/
nir_def *new_base = first->intrin->src[info->base_src].ssa;
new_base = nir_iadd_imm(
b, new_base, -(int)(high_start / 8u / get_offset_scale(first)));
nir_src_rewrite(&first->intrin->src[info->base_src], new_base);
nir_intrinsic_set_offset_shift(first->intrin,
nir_intrinsic_offset_shift(low->intrin));
}
}
@ -1013,7 +1033,7 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
nir_intrinsic_set_range_base(first->intrin, old_low_range_base);
nir_intrinsic_set_range(first->intrin, range);
} else if (nir_intrinsic_has_base(first->intrin) && info->base_src == -1 && info->deref_src == -1) {
} else if (nir_intrinsic_has_base(first->intrin) && info->deref_src == -1) {
nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin));
}
@ -1737,6 +1757,8 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
unsigned next_index = 0;
nir_foreach_instr_safe(instr, block) {
instr->index = next_index++;
if (handle_barrier(ctx, &progress, impl, instr))
continue;
@ -1760,7 +1782,7 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
/* create entry */
struct entry *entry = create_entry(ctx, ctx, info, intrin);
entry->index = next_index++;
entry->index = next_index;
list_addtail(&entry->head, &ctx->entries[mode_index]);