radv: lower push constants in NIR

fossil-db (navi21):
Totals from 879 (1.11% of 79395) affected shaders:
Instrs: 1359371 -> 1360237 (+0.06%); split: -0.02%, +0.08%
CodeSize: 7290856 -> 7294308 (+0.05%); split: -0.01%, +0.06%
SpillSGPRs: 751 -> 800 (+6.52%)
Latency: 21923904 -> 21923983 (+0.00%); split: -0.03%, +0.03%
InvThroughput: 7029748 -> 7029528 (-0.00%); split: -0.03%, +0.03%
VClause: 23595 -> 23610 (+0.06%)
SClause: 31819 -> 32256 (+1.37%); split: -0.07%, +1.44%
Copies: 109175 -> 110089 (+0.84%); split: -0.13%, +0.97%
Branches: 32068 -> 32072 (+0.01%); split: -0.02%, +0.03%
PreSGPRs: 41831 -> 41774 (-0.14%); split: -0.15%, +0.01%
PreVGPRs: 53605 -> 53604 (-0.00%)
VALU: 1020426 -> 1020521 (+0.01%); split: -0.00%, +0.01%
SALU: 135931 -> 136850 (+0.68%); split: -0.08%, +0.76%
SMEM: 51688 -> 51686 (-0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29675>
Author:    Rhys Perry
Date:      2024-06-06 12:57:26 +01:00
Committer: Marge Bot
parent 1ca97f019e
commit edbb75ce3a
2 changed files with 59 additions and 0 deletions


@@ -342,6 +342,55 @@ update_image_intrinsic(nir_builder *b, apply_layout_state *state, nir_intrinsic_
   }
}

static bool
can_increase_load_size(nir_intrinsic_instr *intrin, unsigned offset, unsigned old, unsigned new)
{
   /* Only increase the size of loads if doing so won't extend into a new page/cache-line. */
   unsigned align_mul = MIN2(nir_intrinsic_align_mul(intrin), 64u);
   unsigned end = (nir_intrinsic_align_offset(intrin) + offset + old) & (align_mul - 1);
   return (new - old) <= (align_mul - end);
}
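
For illustration, a distilled standalone version of this check, with the NIR alignment queries replaced by plain parameters (the helper name and sample values below are hypothetical, and all quantities are assumed to be in the same unit): a load may only grow into the room left between its current end and the next align_mul boundary.

#include <stdbool.h>
#include <stdio.h>

/* align_mul must be a power of two (the real pass caps it at a
 * 64-byte cache line). */
static bool
can_grow(unsigned align_mul, unsigned align_offset, unsigned offset, unsigned old_size, unsigned new_size)
{
   unsigned end = (align_offset + offset + old_size) & (align_mul - 1);
   return (new_size - old_size) <= (align_mul - end);
}

int main(void)
{
   printf("%d\n", can_grow(16, 0, 4, 4, 12)); /* 1: ends at 8, 8 units of room */
   printf("%d\n", can_grow(16, 0, 4, 4, 16)); /* 0: would cross the 16-unit boundary */
   return 0;
}
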
static nir_def *
load_push_constant(nir_builder *b, apply_layout_state *state, nir_intrinsic_instr *intrin)
{
   unsigned base = nir_intrinsic_base(intrin);
   unsigned bit_size = intrin->def.bit_size;
   unsigned count = intrin->def.num_components * (bit_size / 32u);
   assert(bit_size >= 32);

   /* Try to use inline push constants when possible. */
   if (nir_src_is_const(intrin->src[0])) {
      unsigned start = (base + nir_src_as_uint(intrin->src[0])) / 4u;
      uint64_t mask = BITFIELD64_MASK(count) << start;
      if ((state->args->ac.inline_push_const_mask & mask) == mask &&
          start + count <= (sizeof(state->args->ac.inline_push_const_mask) * 8u)) {
         start = util_bitcount64(state->args->ac.inline_push_const_mask & BITFIELD64_MASK(start));

         nir_def *res[NIR_MAX_VEC_COMPONENTS * 2];
         for (unsigned i = 0; i < count; i++)
            res[i] = get_scalar_arg(b, 1, state->args->ac.inline_push_consts[start + i]);
         return nir_extract_bits(b, res, count, 0, intrin->def.num_components, bit_size);
      }
   }

   nir_def *addr = get_scalar_arg(b, 1, state->args->ac.push_constants);
   addr = convert_pointer_to_64_bit(b, state, addr);
   nir_def *offset = nir_iadd_imm_nuw(b, intrin->src[0].ssa, base);

   nir_def *data[NIR_MAX_VEC_COMPONENTS];
   unsigned num_loads = 0;
   for (unsigned start = 0; start < count;) {
      unsigned size = 1 << (util_last_bit(count - start) - 1); /* Round down to power of two. */
      /* Try to round up to power of two instead. */
      if (size < (count - start) && can_increase_load_size(intrin, start * 4, size, size * 2))
         size *= 2;
      data[num_loads++] = nir_load_smem_amd(b, size, addr, nir_iadd_imm_nuw(b, offset, start * 4));
      start += size;
   }

   return nir_extract_bits(b, data, num_loads, 0, intrin->def.num_components, bit_size);
}
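
A worked example of the splitting loop: for a 32-bit vec3 (count = 3), the loop emits a 2-dword load at dword offset 0 followed by a 1-dword load at dword offset 2, unless the widening check allows a single 4-dword load instead. A minimal standalone sketch of the round-down step (last_bit() mirrors Mesa's util_last_bit(); the widening branch is elided):

#include <stdio.h>

/* Index of the highest set bit plus one (0 for x == 0), mirroring
 * Mesa's util_last_bit(). */
static unsigned last_bit(unsigned x)
{
   unsigned r = 0;
   while (x) {
      r++;
      x >>= 1;
   }
   return r;
}

int main(void)
{
   unsigned count = 3; /* dwords to load, e.g. a 32-bit vec3 */
   for (unsigned start = 0; start < count;) {
      unsigned size = 1u << (last_bit(count - start) - 1); /* round down to a power of two */
      /* The real pass would try size *= 2 here when can_increase_load_size()
       * says the wider load stays inside the page/cache line. */
      printf("SMEM load of %u dword(s) at dword offset %u\n", size, start);
      start += size;
   }
   return 0; /* prints a 2-dword load at offset 0, then a 1-dword load at offset 2 */
}
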
static void
apply_layout_to_intrin(nir_builder *b, apply_layout_state *state, nir_intrinsic_instr *intrin)
{
@@ -382,6 +431,11 @@ apply_layout_to_intrin(nir_builder *b, apply_layout_state *state, nir_intrinsic_
   case nir_intrinsic_image_deref_descriptor_amd:
      update_image_intrinsic(b, state, intrin);
      break;
   case nir_intrinsic_load_push_constant: {
      nir_def_rewrite_uses(&intrin->def, load_push_constant(b, state, intrin));
      nir_instr_remove(&intrin->instr);
      break;
   }
   default:
      break;
   }
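
The inline fast path in load_push_constant() above remaps push-constant dword indices to compacted SGPR argument indices: dword N is preloaded only if bit N of inline_push_const_mask is set, and its argument index is the popcount of the mask bits below N (the util_bitcount64() expression). A minimal standalone sketch with a made-up mask value (bitcount64() mirrors Mesa's util_bitcount64()):

#include <stdint.h>
#include <stdio.h>

/* Number of set bits, mirroring Mesa's util_bitcount64(). */
static unsigned bitcount64(uint64_t v)
{
   unsigned n = 0;
   for (; v; v &= v - 1)
      n++;
   return n;
}

int main(void)
{
   /* Hypothetical mask: push-constant dwords 0, 2, 3 and 5 are preloaded. */
   uint64_t inline_push_const_mask = 0x2d;
   for (unsigned dword = 0; dword < 6; dword++) {
      uint64_t below = (UINT64_C(1) << dword) - 1;
      if (inline_push_const_mask & (UINT64_C(1) << dword))
         printf("dword %u -> inline_push_consts[%u]\n", dword,
                bitcount64(inline_push_const_mask & below));
      else
         printf("dword %u -> loaded via the push-constant pointer\n", dword);
   }
   return 0;
}
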


@@ -434,6 +434,11 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
   NIR_PASS(_, stage->nir, nir_copy_prop);
   NIR_PASS(_, stage->nir, nir_opt_shrink_stores, !instance->drirc.disable_shrink_image_store);

   /* Ensure vectorized load_push_constant intrinsics still have constant
    * offsets, for radv_nir_apply_pipeline_layout. */
   if (stage->args.ac.inline_push_const_mask)
      NIR_PASS(_, stage->nir, nir_opt_constant_folding);

   /* Gather info again, to update whether 8/16-bit types are used. */
   nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
}