nir: remove duplicate alu channels in nir_opt_shrink_vectors

This will clean up code like:
   vec3 32 ssa_8 = frcp ssa_7.www
   vec3 32 ssa_9 = fmul ssa_7.xyz, ssa_8
into
   vec1 32 ssa_8 = frcp ssa_7.w
   vec3 32 ssa_9 = fmul ssa_7.xyz, ssa_8.xxx

This helps the r300 driver because it can only execute math ops on a
single channel at a time, so the first version would result in three
frcp instructions. The nir_opt_shrink_vectors comments even claim the
pass should be doing this; however, it actually does so only for
nir_op_vecx instructions, so extend it to generic alu instructions.

RV530 shader-db:
total instructions in shared programs: 135032 -> 133707 (-0.98%)
instructions in affected programs: 46121 -> 44796 (-2.87%)
helped: 452
HURT: 26
total temps in shared programs: 17051 -> 17033 (-0.11%)
temps in affected programs: 1509 -> 1491 (-1.19%)
helped: 91
HURT: 30

12.02->12.08 (+0.5%) fps gain in Unigine Sanctuary (n=5) with RV530

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7051
Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Reviewed-by: Gert Wollny <gert.wollny@collabora.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20213>
This commit is contained in:
Pavel Ondračka 2022-12-04 17:34:37 +01:00
parent 980df9ede1
commit cb7f201288

View file

@ -192,49 +192,60 @@ opt_shrink_vectors_alu(nir_builder *b, nir_alu_instr *instr)
return false;
unsigned mask = nir_ssa_def_components_read(def);
unsigned last_bit = util_last_bit(mask);
unsigned num_components = util_bitcount(mask);
unsigned rounded = round_up_components(num_components);
assert(rounded <= def->num_components);
num_components = rounded;
/* return, if there is nothing to do */
if (mask == 0 || num_components == def->num_components)
if (mask == 0)
return false;
const bool is_bitfield_mask = last_bit == num_components;
if (is_bitfield_mask) {
/* just reduce the number of components and return */
def->num_components = num_components;
instr->dest.write_mask = mask;
return true;
}
uint8_t reswizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
unsigned index = 0;
for (unsigned i = 0; i < last_bit; i++) {
unsigned num_components = 0;
bool progress = false;
for (unsigned i = 0; i < def->num_components; i++) {
/* skip unused components */
if (!((mask >> i) & 0x1))
continue;
/* reswizzle the sources */
for (int k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
instr->src[k].swizzle[index] = instr->src[k].swizzle[i];
reswizzle[i] = index;
/* Try reuse a component with the same swizzles */
unsigned j;
for (j = 0; j < num_components; j++) {
bool duplicate_channel = true;
for (unsigned k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
if (nir_op_infos[instr->op].input_sizes[k] != 0 ||
instr->src[k].swizzle[i] != instr->src[k].swizzle[j]) {
duplicate_channel = false;
break;
}
}
if (duplicate_channel) {
reswizzle[i] = j;
progress = true;
break;
}
}
/* Otherwise, just append the value */
if (j == num_components) {
for (int k = 0; k < nir_op_infos[instr->op].num_inputs; k++) {
instr->src[k].swizzle[num_components] = instr->src[k].swizzle[i];
}
if (i != num_components)
progress = true;
reswizzle[i] = num_components++;
}
index++;
}
assert(index == num_components);
unsigned rounded = round_up_components(num_components);
assert(rounded <= def->num_components);
/* update dest */
def->num_components = num_components;
instr->dest.write_mask = BITFIELD_MASK(num_components);
def->num_components = rounded;
instr->dest.write_mask = BITFIELD_MASK(rounded);
/* update uses */
reswizzle_alu_uses(def, reswizzle);
if (progress)
reswizzle_alu_uses(def, reswizzle);
return true;
return progress;
}
static bool