gallium/tgsi_exec: Fix shared memory atomic ops.

It was only executing the first channel, ignoring the rest. I also cleaned things up to not loop over rgba, since atomics are only ever to a single 32-bit value per invocation. This worked on softpipe previously because it only dispatches 1 CS invocation per TGSI exec machine anyway, wasting the other 3 slots. Reviewed-by: Dave Airlie <airlied@redhat.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8276>
2026-05-08 06:58:05 +02:00 · 2020-12-30 11:21:49 -08:00 · 2020-12-30 11:21:49 -08:00 · 85be1f867c
commit 85be1f867c
parent 28a202f2d3
1 changed files with 70 additions and 59 deletions
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@ -4292,76 +4292,87 @@ static void
 exec_atomop_mem(struct tgsi_exec_machine *mach,
                const struct tgsi_full_instruction *inst)
 {
-   union tgsi_exec_channel r[4];
-   union tgsi_exec_channel value[4], value2[4];
-   char *ptr = mach->LocalMem;
-   uint32_t val;
+   union tgsi_exec_channel offset, r0, r1;
   uint chan, i;
-   uint32_t offset;
   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
-   IFETCH(&r[0], 1, TGSI_CHAN_X);
+   IFETCH(&offset, 1, TGSI_CHAN_X);

-   if (r[0].u[0] >= mach->LocalMemSize)
+   if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X))
      return;

-   offset = r[0].u[0];
-   ptr += offset;
-   for (i = 0; i < 4; i++) {
-      FETCH(&value[i], 2, TGSI_CHAN_X + i);
-      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
-         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
+   void *ptr[TGSI_QUAD_SIZE];
+   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+      if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4))
+         ptr[i] = (char *)mach->LocalMem + offset.u[i];
+      else
+         ptr[i] = NULL;
   }

-   memcpy(&r[0].u[0], ptr, 4);
-   val = r[0].u[0];
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ATOMUADD:
-      val += value[0].u[0];
-      break;
-   case TGSI_OPCODE_ATOMXOR:
-      val ^= value[0].u[0];
-      break;
-   case TGSI_OPCODE_ATOMOR:
-      val |= value[0].u[0];
-      break;
-   case TGSI_OPCODE_ATOMAND:
-      val &= value[0].u[0];
-      break;
-   case TGSI_OPCODE_ATOMUMIN:
-      val = MIN2(val, value[0].u[0]);
-      break;
-   case TGSI_OPCODE_ATOMUMAX:
-      val = MAX2(val, value[0].u[0]);
-      break;
-   case TGSI_OPCODE_ATOMIMIN:
-      val = MIN2(r[0].i[0], value[0].i[0]);
-      break;
-   case TGSI_OPCODE_ATOMIMAX:
-      val = MAX2(r[0].i[0], value[0].i[0]);
-      break;
-   case TGSI_OPCODE_ATOMXCHG:
-      val = value[0].i[0];
-      break;
-   case TGSI_OPCODE_ATOMCAS:
-      if (val == value[0].u[0])
-         val = value2[0].u[0];
-      break;
-   case TGSI_OPCODE_ATOMFADD:
-      val = fui(r[0].f[0] + value[0].f[0]);
-      break;
-   default:
-      break;
-   }
-   for (i = 0; i < TGSI_QUAD_SIZE; i++)
-      if (execmask & (1 << i))
-         memcpy(ptr, &val, 4);
+   FETCH(&r0, 2, TGSI_CHAN_X);
+   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+      FETCH(&r1, 3, TGSI_CHAN_X);

-   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
-         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+   /* The load/op/store sequence has to happen inside the loop since ptr
+    * may have the same ptr in some of the invocations.
+    */
+   for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
+      if (!(execmask & (1 << i)))
+         continue;
+
+      uint32_t val = 0;
+      if (ptr[i]) {
+         memcpy(&val, ptr[i], sizeof(val));
+
+         uint32_t result;
+         switch (inst->Instruction.Opcode) {
+         case TGSI_OPCODE_ATOMUADD:
+            result = val + r0.u[i];
+            break;
+         case TGSI_OPCODE_ATOMXOR:
+            result = val ^ r0.u[i];
+            break;
+         case TGSI_OPCODE_ATOMOR:
+            result = val | r0.u[i];
+            break;
+         case TGSI_OPCODE_ATOMAND:
+            result = val & r0.u[i];
+            break;
+         case TGSI_OPCODE_ATOMUMIN:
+            result = MIN2(val, r0.u[i]);
+            break;
+         case TGSI_OPCODE_ATOMUMAX:
+            result = MAX2(val, r0.u[i]);
+            break;
+         case TGSI_OPCODE_ATOMIMIN:
+            result = MIN2((int32_t)val, r0.i[i]);
+            break;
+         case TGSI_OPCODE_ATOMIMAX:
+            result = MAX2((int32_t)val, r0.i[i]);
+            break;
+         case TGSI_OPCODE_ATOMXCHG:
+            result = r0.u[i];
+            break;
+         case TGSI_OPCODE_ATOMCAS:
+            if (val == r0.u[i])
+               result = r1.u[i];
+            else
+               result = val;
+            break;
+         case TGSI_OPCODE_ATOMFADD:
+               result = fui(uif(val) + r0.f[i]);
+            break;
+         default:
+            unreachable("bad atomic op");
+         }
+         memcpy(ptr[i], &result, sizeof(result));
      }
+
+      r0.u[i] = val;
   }
+
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
+      store_dest(mach, &r0, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
 }

 static void