mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 06:58:05 +02:00
gallium/tgsi_exec: Fix shared memory atomic ops.
It was only executing the first channel, ignoring the rest. I also cleaned things up to not loop over rgba, since atomics are only ever to a single 32-bit value per invocation. This worked on softpipe previously because it only dispatches 1 CS invocation per TGSI exec machine anyway, wasting the other 3 slots. Reviewed-by: Dave Airlie <airlied@redhat.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8276>
This commit is contained in:
parent
28a202f2d3
commit
85be1f867c
1 changed files with 70 additions and 59 deletions
|
|
@ -4292,76 +4292,87 @@ static void
|
|||
exec_atomop_mem(struct tgsi_exec_machine *mach,
|
||||
const struct tgsi_full_instruction *inst)
|
||||
{
|
||||
union tgsi_exec_channel r[4];
|
||||
union tgsi_exec_channel value[4], value2[4];
|
||||
char *ptr = mach->LocalMem;
|
||||
uint32_t val;
|
||||
union tgsi_exec_channel offset, r0, r1;
|
||||
uint chan, i;
|
||||
uint32_t offset;
|
||||
int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
|
||||
int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
|
||||
IFETCH(&r[0], 1, TGSI_CHAN_X);
|
||||
IFETCH(&offset, 1, TGSI_CHAN_X);
|
||||
|
||||
if (r[0].u[0] >= mach->LocalMemSize)
|
||||
if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X))
|
||||
return;
|
||||
|
||||
offset = r[0].u[0];
|
||||
ptr += offset;
|
||||
for (i = 0; i < 4; i++) {
|
||||
FETCH(&value[i], 2, TGSI_CHAN_X + i);
|
||||
if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
|
||||
FETCH(&value2[i], 3, TGSI_CHAN_X + i);
|
||||
void *ptr[TGSI_QUAD_SIZE];
|
||||
for (i = 0; i < TGSI_QUAD_SIZE; i++) {
|
||||
if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4))
|
||||
ptr[i] = (char *)mach->LocalMem + offset.u[i];
|
||||
else
|
||||
ptr[i] = NULL;
|
||||
}
|
||||
|
||||
memcpy(&r[0].u[0], ptr, 4);
|
||||
val = r[0].u[0];
|
||||
switch (inst->Instruction.Opcode) {
|
||||
case TGSI_OPCODE_ATOMUADD:
|
||||
val += value[0].u[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMXOR:
|
||||
val ^= value[0].u[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMOR:
|
||||
val |= value[0].u[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMAND:
|
||||
val &= value[0].u[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMUMIN:
|
||||
val = MIN2(val, value[0].u[0]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMUMAX:
|
||||
val = MAX2(val, value[0].u[0]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMIMIN:
|
||||
val = MIN2(r[0].i[0], value[0].i[0]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMIMAX:
|
||||
val = MAX2(r[0].i[0], value[0].i[0]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMXCHG:
|
||||
val = value[0].i[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMCAS:
|
||||
if (val == value[0].u[0])
|
||||
val = value2[0].u[0];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMFADD:
|
||||
val = fui(r[0].f[0] + value[0].f[0]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
for (i = 0; i < TGSI_QUAD_SIZE; i++)
|
||||
if (execmask & (1 << i))
|
||||
memcpy(ptr, &val, 4);
|
||||
FETCH(&r0, 2, TGSI_CHAN_X);
|
||||
if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
|
||||
FETCH(&r1, 3, TGSI_CHAN_X);
|
||||
|
||||
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
|
||||
if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
|
||||
store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
|
||||
/* The load/op/store sequence has to happen inside the loop since ptr
|
||||
* may have the same ptr in some of the invocations.
|
||||
*/
|
||||
for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
|
||||
if (!(execmask & (1 << i)))
|
||||
continue;
|
||||
|
||||
uint32_t val = 0;
|
||||
if (ptr[i]) {
|
||||
memcpy(&val, ptr[i], sizeof(val));
|
||||
|
||||
uint32_t result;
|
||||
switch (inst->Instruction.Opcode) {
|
||||
case TGSI_OPCODE_ATOMUADD:
|
||||
result = val + r0.u[i];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMXOR:
|
||||
result = val ^ r0.u[i];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMOR:
|
||||
result = val | r0.u[i];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMAND:
|
||||
result = val & r0.u[i];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMUMIN:
|
||||
result = MIN2(val, r0.u[i]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMUMAX:
|
||||
result = MAX2(val, r0.u[i]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMIMIN:
|
||||
result = MIN2((int32_t)val, r0.i[i]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMIMAX:
|
||||
result = MAX2((int32_t)val, r0.i[i]);
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMXCHG:
|
||||
result = r0.u[i];
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMCAS:
|
||||
if (val == r0.u[i])
|
||||
result = r1.u[i];
|
||||
else
|
||||
result = val;
|
||||
break;
|
||||
case TGSI_OPCODE_ATOMFADD:
|
||||
result = fui(uif(val) + r0.f[i]);
|
||||
break;
|
||||
default:
|
||||
unreachable("bad atomic op");
|
||||
}
|
||||
memcpy(ptr[i], &result, sizeof(result));
|
||||
}
|
||||
|
||||
r0.u[i] = val;
|
||||
}
|
||||
|
||||
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
|
||||
store_dest(mach, &r0, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue