gallium/tgsi_exec: Fix shared memory atomic ops.

It was only executing the first channel, ignoring the rest.  I also
cleaned things up to not loop over rgba, since atomics are only ever to a
single 32-bit value per invocation.

This worked on softpipe previously because it only dispatches 1 CS
invocation per TGSI exec machine anyway, wasting the other 3 slots.

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8276>
This commit is contained in:
Eric Anholt 2020-12-30 11:21:49 -08:00 committed by Marge Bot
parent 28a202f2d3
commit 85be1f867c

View file

@ -4292,76 +4292,87 @@ static void
exec_atomop_mem(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
{
union tgsi_exec_channel r[4];
union tgsi_exec_channel value[4], value2[4];
char *ptr = mach->LocalMem;
uint32_t val;
union tgsi_exec_channel offset, r0, r1;
uint chan, i;
uint32_t offset;
int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
IFETCH(&r[0], 1, TGSI_CHAN_X);
IFETCH(&offset, 1, TGSI_CHAN_X);
if (r[0].u[0] >= mach->LocalMemSize)
if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X))
return;
offset = r[0].u[0];
ptr += offset;
for (i = 0; i < 4; i++) {
FETCH(&value[i], 2, TGSI_CHAN_X + i);
if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
FETCH(&value2[i], 3, TGSI_CHAN_X + i);
void *ptr[TGSI_QUAD_SIZE];
for (i = 0; i < TGSI_QUAD_SIZE; i++) {
if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4))
ptr[i] = (char *)mach->LocalMem + offset.u[i];
else
ptr[i] = NULL;
}
memcpy(&r[0].u[0], ptr, 4);
val = r[0].u[0];
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ATOMUADD:
val += value[0].u[0];
break;
case TGSI_OPCODE_ATOMXOR:
val ^= value[0].u[0];
break;
case TGSI_OPCODE_ATOMOR:
val |= value[0].u[0];
break;
case TGSI_OPCODE_ATOMAND:
val &= value[0].u[0];
break;
case TGSI_OPCODE_ATOMUMIN:
val = MIN2(val, value[0].u[0]);
break;
case TGSI_OPCODE_ATOMUMAX:
val = MAX2(val, value[0].u[0]);
break;
case TGSI_OPCODE_ATOMIMIN:
val = MIN2(r[0].i[0], value[0].i[0]);
break;
case TGSI_OPCODE_ATOMIMAX:
val = MAX2(r[0].i[0], value[0].i[0]);
break;
case TGSI_OPCODE_ATOMXCHG:
val = value[0].i[0];
break;
case TGSI_OPCODE_ATOMCAS:
if (val == value[0].u[0])
val = value2[0].u[0];
break;
case TGSI_OPCODE_ATOMFADD:
val = fui(r[0].f[0] + value[0].f[0]);
break;
default:
break;
}
for (i = 0; i < TGSI_QUAD_SIZE; i++)
if (execmask & (1 << i))
memcpy(ptr, &val, 4);
FETCH(&r0, 2, TGSI_CHAN_X);
if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
FETCH(&r1, 3, TGSI_CHAN_X);
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
/* The load/op/store sequence has to happen inside the loop since ptr
* may have the same ptr in some of the invocations.
*/
for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
if (!(execmask & (1 << i)))
continue;
uint32_t val = 0;
if (ptr[i]) {
memcpy(&val, ptr[i], sizeof(val));
uint32_t result;
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ATOMUADD:
result = val + r0.u[i];
break;
case TGSI_OPCODE_ATOMXOR:
result = val ^ r0.u[i];
break;
case TGSI_OPCODE_ATOMOR:
result = val | r0.u[i];
break;
case TGSI_OPCODE_ATOMAND:
result = val & r0.u[i];
break;
case TGSI_OPCODE_ATOMUMIN:
result = MIN2(val, r0.u[i]);
break;
case TGSI_OPCODE_ATOMUMAX:
result = MAX2(val, r0.u[i]);
break;
case TGSI_OPCODE_ATOMIMIN:
result = MIN2((int32_t)val, r0.i[i]);
break;
case TGSI_OPCODE_ATOMIMAX:
result = MAX2((int32_t)val, r0.i[i]);
break;
case TGSI_OPCODE_ATOMXCHG:
result = r0.u[i];
break;
case TGSI_OPCODE_ATOMCAS:
if (val == r0.u[i])
result = r1.u[i];
else
result = val;
break;
case TGSI_OPCODE_ATOMFADD:
result = fui(uif(val) + r0.f[i]);
break;
default:
unreachable("bad atomic op");
}
memcpy(ptr[i], &result, sizeof(result));
}
r0.u[i] = val;
}
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
store_dest(mach, &r0, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
}
static void