mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 06:58:05 +02:00
nak: optimize iadds with a uniform operand in address calculations
Instead of doing the iadd manually we can use the uniform slot of the ld/st/atom instruction getting rid of the iadd altogether. Additionally for global memory we can also consume a 32 bit offset instead of requiring it to be 64 bit. Totals from 158539 (13.07% of 1212873) affected shaders: CodeSize: 2308216336 -> 2242231136 (-2.86%); split: -2.86%, +0.00% Number of GPRs: 8682436 -> 8662675 (-0.23%); split: -0.26%, +0.04% SLM Size: 238816 -> 238604 (-0.09%) Static cycle count: 2169063422 -> 2147747544 (-0.98%); split: -0.99%, +0.01% Spills to memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Fills from memory: 25845 -> 25799 (-0.18%); split: -0.20%, +0.02% Spills to reg: 45053 -> 45273 (+0.49%); split: -0.04%, +0.53% Fills from reg: 36385 -> 36757 (+1.02%); split: -0.04%, +1.06% Max warps/SM: 6027232 -> 6034616 (+0.12%); split: +0.12%, -0.00%
This commit is contained in:
parent
eeadd23c09
commit
0b4705ec95
1 changed files with 113 additions and 0 deletions
|
|
@ -1130,6 +1130,113 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
|
|||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_divergent_phi(nir_instr *instr)
|
||||
{
|
||||
if (instr->type != nir_instr_type_phi)
|
||||
return false;
|
||||
nir_phi_instr *phi = nir_instr_as_phi(instr);
|
||||
return nak_nir_phi_is_divergent(phi);
|
||||
}
|
||||
|
||||
/* Per-intrinsic callback: tries to move the uniform half of an address
 * computation into the intrinsic's dedicated uniform-offset slot so the
 * backend can use a UGPR and drop the iadd (and, for 64-bit global
 * addresses, a u2u64) entirely.
 *
 * Returns true if the intrinsic was rewritten, false otherwise.
 * Assumes divergence analysis results are valid (the caller runs
 * nir_divergence_analysis first).
 */
static bool
nak_nir_opt_uniform_address_impl(struct nir_builder *b,
                                 nir_intrinsic_instr *intr, void *cb_data)
{
   switch (intr->intrinsic) {
   /* Only the NV load/store/atomic intrinsics that carry a separate
    * uniform-offset source are handled. */
   case nir_intrinsic_cmat_load_shared_nv:
   case nir_intrinsic_global_atomic_nv:
   case nir_intrinsic_load_global_nv:
   case nir_intrinsic_load_scratch_nv:
   case nir_intrinsic_load_shared_nv:
   case nir_intrinsic_shared_atomic_nv:
   case nir_intrinsic_store_global_nv:
   case nir_intrinsic_store_scratch_nv:
   case nir_intrinsic_store_shared_nv: {
      nir_src *offset_src = nir_get_io_offset_src(intr);
      nir_def *offset = offset_src->ssa;
      nir_src *uniform_offset_src = nir_get_io_uniform_offset_src(intr);
      nir_def *uniform_offset = uniform_offset_src->ssa;
      nir_block *use_block = intr->instr.block;

      /* The uniform slot must still be unused (zero) — this pass fills it
       * exactly once per intrinsic. */
      assert(nir_src_as_uint(*uniform_offset_src) == 0);

      /* Nak can't collect vectors in non uniform control flow, so don't
       * even try */
      if (offset->bit_size == 64 && nak_block_is_divergent(use_block))
         return false;

      /* We ignore any constant offset */
      if (nir_src_is_const(*offset_src))
         return false;

      /* If the source is already uniform, just swap them as the uniform slot
       * should be 0 */
      if (!nir_def_is_divergent_at_use_block(offset, use_block)) {
         /* A divergent phi would make the value non-uniform despite the
          * use-block query, so bail. */
         if (is_divergent_phi(nir_def_instr(offset)))
            return false;
         nir_src_rewrite(uniform_offset_src, offset);
         nir_src_rewrite(offset_src, uniform_offset);
         return true;
      }

      /* Otherwise look for an iadd feeding the offset whose operands split
       * into one divergent and one uniform half. */
      nir_alu_instr *iadd = nir_def_as_alu_or_null(offset_src->ssa);
      if (!iadd || iadd->op != nir_op_iadd)
         return false;

      unsigned src0_div = nir_def_is_divergent_at_use_block(iadd->src[0].src.ssa, use_block);
      unsigned src1_div = nir_def_is_divergent_at_use_block(iadd->src[1].src.ssa, use_block);
      /* Both halves divergent: nothing uniform to hoist. */
      if (src0_div && src1_div)
         return false;

      b->cursor = nir_before_instr(&intr->instr);

      /* addr = the divergent half (stays in the offset slot),
       * uaddr = the uniform half (moves to the uniform slot). */
      nir_def *addr, *uaddr;
      if (src0_div) {
         assert(!src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 0);
         uaddr = nir_ssa_for_alu_src(b, iadd, 1);
      } else {
         assert(src1_div);
         addr = nir_ssa_for_alu_src(b, iadd, 1);
         uaddr = nir_ssa_for_alu_src(b, iadd, 0);
      }

      /* NOTE(review): returning false here may leave behind movs that
       * nir_ssa_for_alu_src emitted above; presumably DCE cleans these up
       * — confirm this is intentional. */
      if (is_divergent_phi(nir_def_instr(uaddr)))
         return false;

      /* We can remove a u2u64 on the non uniform src */
      if (addr->bit_size == 64) {
         nir_alu_instr *u2u64 = nir_def_as_alu_or_null(addr);
         if (u2u64 && u2u64->op == nir_op_u2u64)
            addr = nir_ssa_for_alu_src(b, u2u64, 0);
      }

      nir_src_rewrite(offset_src, addr);
      nir_src_rewrite(uniform_offset_src, uaddr);
      return true;
   }
   default:
      return false;
   }
}
|
||||
|
||||
/** This pass assumes it is ran after nir_opt_offset */
|
||||
static bool
|
||||
nak_nir_opt_uniform_address(nir_shader *nir)
|
||||
{
|
||||
if (nak_debug_no_ugpr())
|
||||
return false;
|
||||
nir_divergence_analysis(nir);
|
||||
return nir_shader_intrinsics_pass(
|
||||
nir,
|
||||
nak_nir_opt_uniform_address_impl,
|
||||
nir_metadata_control_flow,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
nak_nir_opt_offset_shift_nv_impl(struct nir_builder *b,
|
||||
nir_intrinsic_instr *intrin, void *data)
|
||||
|
|
@ -1333,6 +1440,12 @@ nak_postprocess_nir(nir_shader *nir,
|
|||
.cb_data = nak,
|
||||
};
|
||||
OPT(nir, nir_opt_offsets, &nak_offset_options);
|
||||
if (nak->sm >= 73) {
|
||||
OPT(nir, nak_nir_opt_uniform_address);
|
||||
/* TODO: as we eliminate u2u64s we could fold more offsets in, however
|
||||
* This would require us to verify it doesn't overflow, which we can't. */
|
||||
/* OPT(nir, nir_opt_offsets, &nak_offset_options); */
|
||||
}
|
||||
|
||||
/* Should run after nir_opt_offsets, because nir_opt_algebraic will move
|
||||
* iadds down the chain */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue