nak: optimize iadds with a uniform operand in address calculations

Instead of doing the iadd manually, we can use the uniform slot of the
ld/st/atom instruction, getting rid of the iadd altogether.
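
As a rough model (illustrative C only, not NAK or NIR code; the function
and parameter names are made up): before the pass the shader performs the
iadd itself and leaves the instruction's uniform slot at 0, afterwards the
uniform operand sits in the slot and the add happens in the memory
instruction's address path, leaving the old iadd dead:

   #include <stdint.h>

   // Before: explicit iadd in the shader; the ld's uniform slot is 0.
   static inline uint32_t load_before(uint64_t ubase, uint64_t off)
   {
      uint64_t addr = ubase + off;               // the iadd we want to kill
      return *(const uint32_t *)(uintptr_t)addr; // ld, uniform slot unused
   }

   // After: the uniform operand moves into the uniform slot and the add
   // is folded into the ld's address path; DCE removes the old iadd.
   static inline uint32_t load_after(uint64_t ubase, uint64_t off)
   {
      return *(const uint32_t *)(uintptr_t)(ubase + off);
   }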

Additionally, for global memory we can also consume a 32-bit offset instead
of requiring it to be 64-bit.
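
Sketching the same thing for the 32-bit case (again made-up C, assuming the
uniform slot carries the full 64-bit base): once the base is uniform, the
divergent part no longer has to be widened, so the u2u64 feeding the old
iadd can go away too:

   // With a uniform 64-bit base, a 32-bit per-thread offset suffices;
   // the widening happens implicitly in the address path.
   static inline uint32_t load_global_32off(uint64_t ubase, uint32_t off32)
   {
      return *(const uint32_t *)(uintptr_t)(ubase + (uint64_t)off32);
   }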

Totals from 158539 (13.07% of 1212873) affected shaders:
CodeSize: 2308216336 -> 2242231136 (-2.86%); split: -2.86%, +0.00%
Number of GPRs: 8682436 -> 8662675 (-0.23%); split: -0.26%, +0.04%
SLM Size: 238816 -> 238604 (-0.09%)
Static cycle count: 2169063422 -> 2147747544 (-0.98%); split: -0.99%, +0.01%
Spills to memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02%
Fills from memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02%
Spills to reg: 45053 -> 45273 (+0.49%); split: -0.04%, +0.53%
Fills from reg: 36385 -> 36757 (+1.02%); split: -0.04%, +1.06%
Max warps/SM: 6027232 -> 6034616 (+0.12%); split: +0.12%, -0.00%
Karol Herbst, 2026-03-15 21:16:30 +01:00 (committed by Karol Herbst)
parent eeadd23c09
commit 0b4705ec95

@@ -1130,6 +1130,113 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
   return progress;
}

static bool
is_divergent_phi(nir_instr *instr)
{
   if (instr->type != nir_instr_type_phi)
      return false;

   nir_phi_instr *phi = nir_instr_as_phi(instr);
   return nak_nir_phi_is_divergent(phi);
}
static bool
nak_nir_opt_uniform_address_impl(struct nir_builder *b,
                                 nir_intrinsic_instr *intr, void *cb_data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_cmat_load_shared_nv:
   case nir_intrinsic_global_atomic_nv:
   case nir_intrinsic_load_global_nv:
   case nir_intrinsic_load_scratch_nv:
   case nir_intrinsic_load_shared_nv:
   case nir_intrinsic_shared_atomic_nv:
   case nir_intrinsic_store_global_nv:
   case nir_intrinsic_store_scratch_nv:
   case nir_intrinsic_store_shared_nv: {
      nir_src *offset_src = nir_get_io_offset_src(intr);
      nir_def *offset = offset_src->ssa;
      nir_src *uniform_offset_src = nir_get_io_uniform_offset_src(intr);
      nir_def *uniform_offset = uniform_offset_src->ssa;
      nir_block *use_block = intr->instr.block;

      assert(nir_src_as_uint(*uniform_offset_src) == 0);

      /* NAK can't collect vectors in non-uniform control flow, so don't
       * even try.
       */
      if (offset->bit_size == 64 && nak_block_is_divergent(use_block))
         return false;

      /* We ignore any constant offset. */
      if (nir_src_is_const(*offset_src))
         return false;

      /* If the source is already uniform, just swap the two sources, as the
       * uniform slot should be 0.
       */
      if (!nir_def_is_divergent_at_use_block(offset, use_block)) {
         if (is_divergent_phi(nir_def_instr(offset)))
            return false;

         nir_src_rewrite(uniform_offset_src, offset);
         nir_src_rewrite(offset_src, uniform_offset);
         return true;
      }

      nir_alu_instr *iadd = nir_def_as_alu_or_null(offset_src->ssa);
      if (!iadd || iadd->op != nir_op_iadd)
         return false;

      unsigned src0_div =
         nir_def_is_divergent_at_use_block(iadd->src[0].src.ssa, use_block);
      unsigned src1_div =
         nir_def_is_divergent_at_use_block(iadd->src[1].src.ssa, use_block);
      if (src0_div && src1_div)
         return false;

      b->cursor = nir_before_instr(&intr->instr);

      nir_def *addr, *uaddr;
      if (src0_div) {
         assert(!src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 0);
         uaddr = nir_ssa_for_alu_src(b, iadd, 1);
      } else {
         assert(src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 1);
         uaddr = nir_ssa_for_alu_src(b, iadd, 0);
      }

      if (is_divergent_phi(nir_def_instr(uaddr)))
         return false;

      /* We can remove a u2u64 on the non-uniform source. */
      if (addr->bit_size == 64) {
         nir_alu_instr *u2u64 = nir_def_as_alu_or_null(addr);
         if (u2u64 && u2u64->op == nir_op_u2u64)
            addr = nir_ssa_for_alu_src(b, u2u64, 0);
      }

      nir_src_rewrite(offset_src, addr);
      nir_src_rewrite(uniform_offset_src, uaddr);
      return true;
   }
   default:
      return false;
   }
}
/** This pass assumes it is run after nir_opt_offsets. */
static bool
nak_nir_opt_uniform_address(nir_shader *nir)
{
   if (nak_debug_no_ugpr())
      return false;

   nir_divergence_analysis(nir);

   return nir_shader_intrinsics_pass(nir, nak_nir_opt_uniform_address_impl,
                                     nir_metadata_control_flow, NULL);
}

static bool
nak_nir_opt_offset_shift_nv_impl(struct nir_builder *b,
                                 nir_intrinsic_instr *intrin, void *data)
@@ -1333,6 +1440,12 @@ nak_postprocess_nir(nir_shader *nir,
      .cb_data = nak,
   };
   OPT(nir, nir_opt_offsets, &nak_offset_options);

   if (nak->sm >= 73) {
      OPT(nir, nak_nir_opt_uniform_address);
      /* TODO: as we eliminate u2u64s we could fold more offsets in, however
       * this would require us to verify it doesn't overflow, which we can't.
       */
      /* OPT(nir, nir_opt_offsets, &nak_offset_options); */
   }

   /* Should run after nir_opt_offsets, because nir_opt_algebraic will move
    * iadds down the chain