radv: improve LDS alignment check for load/store vectorization

Previously, this could vectorize two scalar 16-bit loads into a u8vec4
load.

No fossil-db changes.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11113>
This commit is contained in:
Author: Rhys Perry — 2021-06-01 11:20:00 +01:00 (committed by Marge Bot)
parent 4870d7d829
commit d2b9c7e982

View file

@ -3124,13 +3124,23 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_s
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
if (bit_size * num_components ==
96) /* 96 bit loads require 128 bit alignment and are split otherwise */
96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
return align % 16 == 0;
else if (bit_size * num_components ==
128) /* 128 bit loads require 64 bit alignment and are split otherwise */
return align % 8 == 0;
else
return align % (bit_size == 8 ? 2 : 4) == 0;
} else if (bit_size == 16 && (align % 4)) {
/* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
* vectorization, because our vectorizer requires the scalar IR to already contain vectors.
*/
return (align % 2 == 0) && num_components <= 2;
} else {
if (num_components == 3) {
/* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
return false;
}
unsigned req = bit_size * num_components;
if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
req /= 2u;
return align % (req / 8u) == 0;
}
default:
return false;
}