radv/nir/lower_cmat: vectorize GFX11 ACC -> B conversion

Foz-DB Navi31:
Totals from 10 out of 14 FSR4 shaders:
Instrs: 64204 -> 60749 (-5.38%)
CodeSize: 439052 -> 417668 (-4.87%)
SpillVGPRs: 186 -> 188 (+1.08%)
Scratch: 23808 -> 24064 (+1.08%)
Latency: 208878 -> 202903 (-2.86%)
InvThroughput: 232898 -> 225688 (-3.10%)
VClause: 902 -> 907 (+0.55%); split: -1.55%, +2.11%
Copies: 6418 -> 3762 (-41.38%)
Branches: 55 -> 37 (-32.73%)
PreSGPRs: 297 -> 298 (+0.34%)
PreVGPRs: 2299 -> 2303 (+0.17%)
VALU: 54762 -> 51489 (-5.98%)
SALU: 956 -> 938 (-1.88%)
VMEM: 3469 -> 3473 (+0.12%)
VOPD: 3895 -> 2126 (-45.42%)

Acked-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36115>
This commit is contained in:
Georg Lehmann 2025-07-14 14:17:30 +02:00 committed by Marge Bot
parent d672737372
commit 7546169e1c

View file

@ -234,29 +234,76 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
assert(params->gfx_level < GFX12);
nir_def *tmp[NIR_MAX_VEC_COMPONENTS];
if (params->wave_size == 64) {
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
if (src->bit_size == 32) {
if (params->wave_size == 64) {
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
for (int i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
tmp[i * 2] = nir_bcsel(b, low_lanes, comp, half_swap);
tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, comp);
}
num_comps *= 2;
memcpy(components, tmp, sizeof(components));
}
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
for (int i = 0; i < num_comps; i++) {
unsigned swap16 = 0x1f | (0x10 << 10);
nir_def *half_swap = nir_masked_swizzle_amd(b, components[i], .swizzle_mask = swap16, .fetch_inactive = 1);
tmp[i * 2] = nir_bcsel(b, low_lanes, components[i], half_swap);
tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, components[i]);
}
num_comps *= 2;
memcpy(components, tmp, sizeof(components));
} else {
/* Same as above, but operate on 32 bits at once, using byte_perm as vectorized bcsel. */
assert(src->bit_size < 32);
nir_def *packed = nir_extract_bits(b, components, num_comps, 0, num_comps / (32 / src->bit_size), 32);
num_comps /= 32 / src->bit_size;
for (unsigned i = 0; i < num_comps; i++)
components[i] = nir_channel(b, packed, i);
nir_def *low_sel = nir_imm_int(b, src->bit_size == 8 ? 0x05010400 : 0x05040100);
nir_def *high_sel = nir_imm_int(b, src->bit_size == 8 ? 0x01050004 : 0x01000504);
if (params->wave_size == 64) {
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
for (int i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
tmp[i * 2] = nir_byte_perm_amd(b, half_swap, comp, first_perm);
tmp[i * 2 + 1] = nir_byte_perm_amd(b, half_swap, comp, second_perm);
}
num_comps *= 2;
memcpy(components, tmp, sizeof(components));
}
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
for (int i = 0; i < num_comps; i++) {
nir_def *comp = components[i];
nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
tmp[i * 2] = nir_bcsel(b, low_lanes, comp, half_swap);
tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, comp);
unsigned swap16 = 0x1f | (0x10 << 10);
nir_def *half_swap = nir_masked_swizzle_amd(b, comp, .swizzle_mask = swap16, .fetch_inactive = 1);
tmp[i * 2] = nir_byte_perm_amd(b, half_swap, comp, first_perm);
tmp[i * 2 + 1] = nir_byte_perm_amd(b, half_swap, comp, second_perm);
}
num_comps *= 2;
memcpy(components, tmp, sizeof(components));
nir_def *unpacked =
nir_extract_bits(b, components, num_comps, 0, num_comps * (32 / src->bit_size), src->bit_size);
num_comps *= 32 / src->bit_size;
for (unsigned i = 0; i < num_comps; i++)
components[i] = nir_channel(b, unpacked, i);
}
nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
for (int i = 0; i < num_comps; i++) {
unsigned swap16 = 0x1f | (0x10 << 10);
nir_def *half_swap = nir_masked_swizzle_amd(b, components[i], .swizzle_mask = swap16, .fetch_inactive = 1);
tmp[i * 2] = nir_bcsel(b, low_lanes, components[i], half_swap);
tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, components[i]);
}
num_comps *= 2;
memcpy(components, tmp, sizeof(components));
assert(num_comps == 16);
} else if (src_use == GLSL_CMAT_USE_B && dst_use == GLSL_CMAT_USE_ACCUMULATOR) {
assert(params->gfx_level < GFX12);