nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled

We were not taking into account that when all invocations within the
workgroup are active, we may copy more data than needed, corrupting the
task payloads of other workgroups.

Fixes: 8aff8d3dd4 ("nir: Add common task shader lowering to make the backend's job easier.")
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20080>
Marcin Ślusarz 2022-11-30 13:47:19 +01:00 committed by Marge Bot
parent bd30adf89d
commit ffefa386fd
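
To make the overflow described in the commit message concrete, here is a
minimal standalone sketch of the old copy-count arithmetic, assuming an
illustrative 48-byte payload and a 32-invocation workgroup (DIV_ROUND_UP is
redefined locally rather than taken from Mesa's headers):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   const unsigned payload_size   = 48;  /* bytes of user payload (example value) */
   const unsigned invocations    = 32;  /* e.g. a 32x1x1 workgroup (example value) */
   const unsigned bytes_per_copy = 16;

   /* Old scheme: round the per-invocation copy count up, then let every
    * invocation in the workgroup perform that many 16-byte copies. */
   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);       /* 3 */
   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations); /* 1 */
   const unsigned bytes_written = copies_per_invocation * invocations * bytes_per_copy;

   /* Prints "512 bytes written for a 48-byte payload": everything past the
    * first 48 bytes lands outside this workgroup's payload. */
   printf("%u bytes written for a %u-byte payload\n", bytes_written, payload_size);
   return 0;
}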

@@ -191,22 +191,50 @@ lower_task_payload_to_shared(nir_builder *b,
    return true;
 }
 
+static void
+copy_shared_to_payload(nir_builder *b,
+                       unsigned num_components,
+                       nir_ssa_def *addr,
+                       unsigned shared_base,
+                       unsigned off)
+{
+   /* Read from shared memory. */
+   nir_ssa_def *copy = nir_load_shared(b, num_components, 32, addr,
+                                       .align_mul = 16,
+                                       .base = shared_base + off);
+
+   /* Write to task payload memory. */
+   nir_store_task_payload(b, copy, addr, .base = off);
+}
+
 static void
 emit_shared_to_payload_copy(nir_builder *b,
                             uint32_t payload_addr,
                             uint32_t payload_size,
                             lower_task_state *s)
 {
    /* Copy from shared memory to task payload using as much parallelism
-    * as possible.
+    * as possible. This is achieved by splitting the work into max 3 phases:
+    * 1) copy maximum number of vec4s using all invocations within workgroup
+    * 2) copy maximum number of vec4s using some invocations
+    * 3) copy remaining dwords (< 4) using only the first invocation
     */
    const unsigned invocations = b->shader->info.workgroup_size[0] *
                                 b->shader->info.workgroup_size[1] *
                                 b->shader->info.workgroup_size[2];
-   const unsigned bytes_per_copy = 16;
-   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
-   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
+   const unsigned vec4size = 16;
+   const unsigned whole_wg_vec4_copies = payload_size / vec4size;
+   const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;
+   const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;
+   const unsigned remaining_dwords =
+         DIV_ROUND_UP(payload_size
+                        - vec4size * vec4_copies_per_invocation * invocations
+                        - vec4size * remaining_vec4_copies,
+                      4);
    const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
 
    nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
-   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
+   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, vec4size);
 
    /* Wait for all previous shared stores to finish.
    * This is necessary because we placed the payload in shared memory.
@@ -216,22 +244,50 @@ emit_shared_to_payload_copy(nir_builder *b,
                          .memory_semantics = NIR_MEMORY_ACQ_REL,
                          .memory_modes = nir_var_mem_shared);
 
-   for (unsigned i = 0; i < copies_per_invocation; ++i) {
-      /* Payload_size is a size of user-accessible payload, but on some
-       * hardware (e.g. Intel) payload has a private header, which we have
-       * to offset (payload_offset_in_bytes).
-       */
-      unsigned const_off =
-         bytes_per_copy * invocations * i + s->payload_offset_in_bytes;
+   /* Payload_size is a size of user-accessible payload, but on some
+    * hardware (e.g. Intel) payload has a private header, which we have
+    * to offset (payload_offset_in_bytes).
+    */
+   unsigned off = s->payload_offset_in_bytes;
 
-      /* Read from shared memory. */
-      nir_ssa_def *copy =
-         nir_load_shared(b, 4, 32, addr, .align_mul = 16,
-                         .base = base_shared_addr + const_off);
+   /* Technically dword-alignment is not necessary for correctness
+    * of the code below, but even if backend implements unaligned
+    * load/stores, they will very likely be slow(er).
+    */
+   assert(off % 4 == 0);
 
-      /* Write to task payload memory. */
-      nir_store_task_payload(b, copy, addr, .base = const_off);
+   /* Copy full vec4s using all invocations in workgroup. */
+   for (unsigned i = 0; i < vec4_copies_per_invocation; ++i) {
+      copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      off += vec4size * invocations;
    }
+
+   /* Copy full vec4s using only the invocations needed to not overflow. */
+   if (remaining_vec4_copies > 0) {
+      assert(remaining_vec4_copies < invocations);
+
+      nir_ssa_def *cmp = nir_ilt(b, invocation_index, nir_imm_int(b, remaining_vec4_copies));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += vec4size * remaining_vec4_copies;
+   }
+
+   /* Copy the last few dwords not forming full vec4. */
+   if (remaining_dwords > 0) {
+      assert(remaining_dwords < 4);
+
+      nir_ssa_def *cmp = nir_ieq(b, invocation_index, nir_imm_int(b, 0));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, remaining_dwords, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += remaining_dwords * 4;
+   }
+
+   assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off);
 }
 
 static bool
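
For reference, a minimal standalone sketch of the three-phase split introduced
by this change, assuming an illustrative 1234-byte payload and a 32-invocation
workgroup; DIV_ROUND_UP and ALIGN are simplified local stand-ins rather than
Mesa's macros. It checks the same invariant as the assert at the end of
emit_shared_to_payload_copy:

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN(v, a)        (DIV_ROUND_UP(v, a) * (a))

int main(void)
{
   const unsigned payload_size = 1234; /* bytes of user payload (example value) */
   const unsigned invocations  = 32;   /* e.g. a 32x1x1 workgroup (example value) */
   const unsigned vec4size     = 16;

   /* Phase 1: vec4 copies performed by every invocation in the workgroup. */
   const unsigned whole_wg_vec4_copies = payload_size / vec4size;                   /* 77 */
   const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;  /* 2 */

   /* Phase 2: leftover vec4 copies, performed by the first few invocations only. */
   const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;       /* 13 */

   /* Phase 3: trailing dwords (< 4 of them), copied by invocation 0 alone. */
   const unsigned remaining_dwords =
      DIV_ROUND_UP(payload_size
                     - vec4size * vec4_copies_per_invocation * invocations
                     - vec4size * remaining_vec4_copies,
                   4);                                                              /* 1 */

   const unsigned total = vec4size * vec4_copies_per_invocation * invocations +
                          vec4size * remaining_vec4_copies +
                          4 * remaining_dwords;

   /* The three phases cover exactly the dword-aligned payload, no more:
    * no write can spill into another workgroup's payload. */
   assert(total == ALIGN(payload_size, 4));
   printf("phases copy %u bytes for a %u-byte payload\n", total, payload_size);
   return 0;
}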