From 85be1f867c16acc773609acf6bd9baaa2c457936 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 30 Dec 2020 11:21:49 -0800 Subject: [PATCH] gallium/tgsi_exec: Fix shared memory atomic ops. It was only executing the first channel, ignoring the rest. I also cleaned things up to not loop over rgba, since atomics are only ever to a single 32-bit value per invocation. This worked on softpipe previously because it only dispatches 1 CS invocation per TGSI exec machine anyway, wasting the other 3 slots. Reviewed-by: Dave Airlie Part-of: --- src/gallium/auxiliary/tgsi/tgsi_exec.c | 129 ++++++++++++++----------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 9c23d5ae65b..1238d273229 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -4292,76 +4292,87 @@ static void exec_atomop_mem(struct tgsi_exec_machine *mach, const struct tgsi_full_instruction *inst) { - union tgsi_exec_channel r[4]; - union tgsi_exec_channel value[4], value2[4]; - char *ptr = mach->LocalMem; - uint32_t val; + union tgsi_exec_channel offset, r0, r1; uint chan, i; - uint32_t offset; int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; - IFETCH(&r[0], 1, TGSI_CHAN_X); + IFETCH(&offset, 1, TGSI_CHAN_X); - if (r[0].u[0] >= mach->LocalMemSize) + if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X)) return; - offset = r[0].u[0]; - ptr += offset; - for (i = 0; i < 4; i++) { - FETCH(&value[i], 2, TGSI_CHAN_X + i); - if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) - FETCH(&value2[i], 3, TGSI_CHAN_X + i); + void *ptr[TGSI_QUAD_SIZE]; + for (i = 0; i < TGSI_QUAD_SIZE; i++) { + if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4)) + ptr[i] = (char *)mach->LocalMem + offset.u[i]; + else + ptr[i] = NULL; } - memcpy(&r[0].u[0], ptr, 4); - val = r[0].u[0]; - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_ATOMUADD: - val += value[0].u[0]; - break; - case TGSI_OPCODE_ATOMXOR: - val ^= value[0].u[0]; - break; - case TGSI_OPCODE_ATOMOR: - val |= value[0].u[0]; - break; - case TGSI_OPCODE_ATOMAND: - val &= value[0].u[0]; - break; - case TGSI_OPCODE_ATOMUMIN: - val = MIN2(val, value[0].u[0]); - break; - case TGSI_OPCODE_ATOMUMAX: - val = MAX2(val, value[0].u[0]); - break; - case TGSI_OPCODE_ATOMIMIN: - val = MIN2(r[0].i[0], value[0].i[0]); - break; - case TGSI_OPCODE_ATOMIMAX: - val = MAX2(r[0].i[0], value[0].i[0]); - break; - case TGSI_OPCODE_ATOMXCHG: - val = value[0].i[0]; - break; - case TGSI_OPCODE_ATOMCAS: - if (val == value[0].u[0]) - val = value2[0].u[0]; - break; - case TGSI_OPCODE_ATOMFADD: - val = fui(r[0].f[0] + value[0].f[0]); - break; - default: - break; - } - for (i = 0; i < TGSI_QUAD_SIZE; i++) - if (execmask & (1 << i)) - memcpy(ptr, &val, 4); + FETCH(&r0, 2, TGSI_CHAN_X); + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) + FETCH(&r1, 3, TGSI_CHAN_X); - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - if (inst->Dst[0].Register.WriteMask & (1 << chan)) { - store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + /* The load/op/store sequence has to happen inside the loop since ptr + * may have the same ptr in some of the invocations. + */ + for (int i = 0; i < TGSI_QUAD_SIZE; i++) { + if (!(execmask & (1 << i))) + continue; + + uint32_t val = 0; + if (ptr[i]) { + memcpy(&val, ptr[i], sizeof(val)); + + uint32_t result; + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ATOMUADD: + result = val + r0.u[i]; + break; + case TGSI_OPCODE_ATOMXOR: + result = val ^ r0.u[i]; + break; + case TGSI_OPCODE_ATOMOR: + result = val | r0.u[i]; + break; + case TGSI_OPCODE_ATOMAND: + result = val & r0.u[i]; + break; + case TGSI_OPCODE_ATOMUMIN: + result = MIN2(val, r0.u[i]); + break; + case TGSI_OPCODE_ATOMUMAX: + result = MAX2(val, r0.u[i]); + break; + case TGSI_OPCODE_ATOMIMIN: + result = MIN2((int32_t)val, r0.i[i]); + break; + case TGSI_OPCODE_ATOMIMAX: + result = MAX2((int32_t)val, r0.i[i]); + break; + case TGSI_OPCODE_ATOMXCHG: + result = r0.u[i]; + break; + case TGSI_OPCODE_ATOMCAS: + if (val == r0.u[i]) + result = r1.u[i]; + else + result = val; + break; + case TGSI_OPCODE_ATOMFADD: + result = fui(uif(val) + r0.f[i]); + break; + default: + unreachable("bad atomic op"); + } + memcpy(ptr[i], &result, sizeof(result)); } + + r0.u[i] = val; } + + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) + store_dest(mach, &r0, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); } static void