radv/nir/lower_cmat: vectorize GFX11 ACC -> B conversion

Foz-DB Navi31: Totals from 10 out of 14 FSR4 shaders: Instrs: 64204 -> 60749 (-5.38%) CodeSize: 439052 -> 417668 (-4.87%) SpillVGPRs: 186 -> 188 (+1.08%) Scratch: 23808 -> 24064 (+1.08%) Latency: 208878 -> 202903 (-2.86%) InvThroughput: 232898 -> 225688 (-3.10%) VClause: 902 -> 907 (+0.55%); split: -1.55%, +2.11% Copies: 6418 -> 3762 (-41.38%) Branches: 55 -> 37 (-32.73%) PreSGPRs: 297 -> 298 (+0.34%) PreVGPRs: 2299 -> 2303 (+0.17%) VALU: 54762 -> 51489 (-5.98%) SALU: 956 -> 938 (-1.88%) VMEM: 3469 -> 3473 (+0.12%) VOPD: 3895 -> 2126 (-45.42%) Acked-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36115>
2026-04-28 08:40:37 +02:00 · 2025-07-14 14:17:30 +02:00 · 2025-07-14 14:17:30 +02:00 · 7546169e1c
commit 7546169e1c
parent d672737372
1 changed files with 63 additions and 16 deletions
--- a/src/amd/vulkan/nir/radv_nir_lower_cooperative_matrix.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_cooperative_matrix.c
@ -234,29 +234,76 @@ convert_use(nir_builder *b, nir_def *src, enum glsl_cmat_use src_use, enum glsl_
      assert(params->gfx_level < GFX12);
      nir_def *tmp[NIR_MAX_VEC_COMPONENTS];

-      if (params->wave_size == 64) {
-         nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
+      if (src->bit_size == 32) {
+         if (params->wave_size == 64) {
+            nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
+            for (int i = 0; i < num_comps; i++) {
+               nir_def *comp = components[i];
+               nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
+
+               tmp[i * 2] = nir_bcsel(b, low_lanes, comp, half_swap);
+               tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, comp);
+            }
+            num_comps *= 2;
+            memcpy(components, tmp, sizeof(components));
+         }
+
+         nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
+         for (int i = 0; i < num_comps; i++) {
+            unsigned swap16 = 0x1f | (0x10 << 10);
+            nir_def *half_swap = nir_masked_swizzle_amd(b, components[i], .swizzle_mask = swap16, .fetch_inactive = 1);
+            tmp[i * 2] = nir_bcsel(b, low_lanes, components[i], half_swap);
+            tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, components[i]);
+         }
+
+         num_comps *= 2;
+         memcpy(components, tmp, sizeof(components));
+      } else {
+         /* Same as above, but operate on 32 bits at once, using byte_perm as vectorized bcsel. */
+         assert(src->bit_size < 32);
+         nir_def *packed = nir_extract_bits(b, components, num_comps, 0, num_comps / (32 / src->bit_size), 32);
+         num_comps /= 32 / src->bit_size;
+         for (unsigned i = 0; i < num_comps; i++)
+            components[i] = nir_channel(b, packed, i);
+
+         nir_def *low_sel = nir_imm_int(b, src->bit_size == 8 ? 0x05010400 : 0x05040100);
+         nir_def *high_sel = nir_imm_int(b, src->bit_size == 8 ? 0x01050004 : 0x01000504);
+
+         if (params->wave_size == 64) {
+            nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, UINT32_MAX, 64));
+            nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
+            nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
+            for (int i = 0; i < num_comps; i++) {
+               nir_def *comp = components[i];
+               nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
+
+               tmp[i * 2] = nir_byte_perm_amd(b, half_swap, comp, first_perm);
+               tmp[i * 2 + 1] = nir_byte_perm_amd(b, half_swap, comp, second_perm);
+            }
+            num_comps *= 2;
+            memcpy(components, tmp, sizeof(components));
+         }
+
+         nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
+         nir_def *first_perm = nir_bcsel(b, low_lanes, low_sel, high_sel);
+         nir_def *second_perm = nir_ior_imm(b, first_perm, 0x02020202);
         for (int i = 0; i < num_comps; i++) {
            nir_def *comp = components[i];
-            nir_def *half_swap = nir_rotate(b, comp, nir_imm_int(b, 32), .cluster_size = 64);
-
-            tmp[i * 2] = nir_bcsel(b, low_lanes, comp, half_swap);
-            tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, comp);
+            unsigned swap16 = 0x1f | (0x10 << 10);
+            nir_def *half_swap = nir_masked_swizzle_amd(b, comp, .swizzle_mask = swap16, .fetch_inactive = 1);
+            tmp[i * 2] = nir_byte_perm_amd(b, half_swap, comp, first_perm);
+            tmp[i * 2 + 1] = nir_byte_perm_amd(b, half_swap, comp, second_perm);
         }
         num_comps *= 2;
         memcpy(components, tmp, sizeof(components));
+
+         nir_def *unpacked =
+            nir_extract_bits(b, components, num_comps, 0, num_comps * (32 / src->bit_size), src->bit_size);
+         num_comps *= 32 / src->bit_size;
+         for (unsigned i = 0; i < num_comps; i++)
+            components[i] = nir_channel(b, unpacked, i);
      }

-      nir_def *low_lanes = nir_inverse_ballot(b, 1, nir_imm_intN_t(b, 0xffff0000ffffull, params->wave_size));
-      for (int i = 0; i < num_comps; i++) {
-         unsigned swap16 = 0x1f | (0x10 << 10);
-         nir_def *half_swap = nir_masked_swizzle_amd(b, components[i], .swizzle_mask = swap16, .fetch_inactive = 1);
-         tmp[i * 2] = nir_bcsel(b, low_lanes, components[i], half_swap);
-         tmp[i * 2 + 1] = nir_bcsel(b, low_lanes, half_swap, components[i]);
-      }
-
-      num_comps *= 2;
-      memcpy(components, tmp, sizeof(components));
      assert(num_comps == 16);
   } else if (src_use == GLSL_CMAT_USE_B && dst_use == GLSL_CMAT_USE_ACCUMULATOR) {
      assert(params->gfx_level < GFX12);