radv: vectorize some integer arithmetic and bcsel with scalar condition

Totals from 106 (0.13% of 79839) affected shaders: (Navi48)

Instrs: 131026 -> 130042 (-0.75%); split: -0.82%, +0.07%
CodeSize: 719120 -> 711516 (-1.06%); split: -1.20%, +0.14%
VGPRs: 5244 -> 5232 (-0.23%)
Latency: 2020748 -> 2004602 (-0.80%); split: -0.81%, +0.01%
InvThroughput: 393330 -> 385414 (-2.01%); split: -2.01%, +0.00%
VClause: 2193 -> 2192 (-0.05%)
Copies: 13963 -> 13558 (-2.90%); split: -2.91%, +0.01%
PreVGPRs: 2953 -> 2921 (-1.08%)
VALU: 65595 -> 64835 (-1.16%); split: -1.16%, +0.00%
SALU: 26887 -> 26611 (-1.03%)
VMEM: 2921 -> 3005 (+2.88%)
VOPD: 168 -> 173 (+2.98%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35784>
2026-05-05 16:08:04 +02:00 · 2025-06-27 08:29:39 +02:00 · 2025-06-27 08:29:39 +02:00 · 4671e5f20d
commit 4671e5f20d
parent 764ee3a834
2 changed files with 49 additions and 2 deletions
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -253,6 +253,23 @@ ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, uint32
   return ycbcr_samplers + array_index;
 }

+static uint8_t
+max_alu_src_identity_swizzle(const nir_alu_instr *alu, const nir_alu_src *src)
+{
+   uint8_t max_vector = 32 / alu->def.bit_size;
+   if (nir_src_is_const(src->src))
+      return max_vector;
+
+   /* Find the first component that breaks the consecutive swizzle[0] + i run. */
+   for (unsigned i = 1; i < alu->def.num_components; i++) {
+      if (src->swizzle[i] != src->swizzle[0] + i)
+         /* Round i down to an even count, never below 1 (a power of 2 for i < 6). */
+         return MAX2(i & 0x6, 1);
+   }
+
+   return max_vector;
+}
+
 static uint8_t
 opt_vectorize_callback(const nir_instr *instr, const void *_)
 {
@@ -281,10 +298,38 @@ opt_vectorize_callback(const nir_instr *instr, const void *_)
   }

   const unsigned bit_size = alu->def.bit_size;
-   if (bit_size != 16)
+   if (bit_size == 16 && aco_nir_op_supports_packed_math_16bit(alu))
+      return 2;
+
+   if (bit_size != 8 && bit_size != 16)
      return 1;

-   return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
+   /* Keep some opcodes vectorized if the operation can be performed as
+    * 32-bit instruction with packed sources. The condition is that the
+    * sources must have identity swizzles. */
+   uint8_t target_width = 32 / bit_size;
+   switch (alu->op) {
+   case nir_op_bcsel:
+      /* Must have scalar condition. */
+      for (unsigned i = 1; i < alu->def.num_components; i++) {
+         if (alu->src[0].swizzle[i] != alu->src[0].swizzle[0])
+            return 1;
+      }
+      for (unsigned idx = 1; idx < 3; idx++)
+         target_width = MIN2(target_width, max_alu_src_identity_swizzle(alu, &alu->src[idx]));
+      break;
+   case nir_op_iand:
+   case nir_op_ior:
+   case nir_op_ixor:
+   case nir_op_inot:
+      for (unsigned idx = 0; idx < nir_op_infos[alu->op].num_inputs; idx++)
+         target_width = MIN2(target_width, max_alu_src_identity_swizzle(alu, &alu->src[idx]));
+      break;
+   default:
+      return 1;
+   }
+
+   return target_width;
 }

 static nir_component_mask_t
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -98,6 +98,8 @@ vectorize_vec2_16bit(const nir_instr *instr, const void *_)
   const unsigned bit_size = alu->def.bit_size;
   if (bit_size == 16)
      return 2;
+   else if (bit_size == 8)
+      return 4;
   else
      return 1;
 }