r600: Implement memoryBarrier() in the non-SFN path.

Previously we were just doing a group barrier for both membar and barrier.
This sometimes worked out, because atomics and reads waited for ack
already, but writes were not waiting for ack.  Use the need_wait_ack
pattern that scratch writes used, with a little refactoring for
reusability.

The refactor also incidentally fixes the atomics path, which only waited
when the number of outstanding acks was > 1 instead of > 0.

Cc: mesa-stable
Fixes: #6028
Reviewed-by: Gert Wollny <gert.wollny@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14429>
This commit is contained in:
Emma Anholt 2022-02-11 15:11:59 -08:00
parent 991a95a352
commit b8324a7387
6 changed files with 86 additions and 37 deletions

View file

@ -600,14 +600,10 @@ dEQP-GLES31.functional.stencil_texturing.format.depth32f_stencil8_2d_array,Fail
dEQP-GLES31.functional.stencil_texturing.format.stencil_index8_2d,Fail
dEQP-GLES31.functional.stencil_texturing.format.stencil_index8_2d_array,Fail
dEQP-GLES31.functional.synchronization.in_invocation.image_alias_overwrite,Fail
dEQP-GLES31.functional.synchronization.in_invocation.image_alias_write,Fail
dEQP-GLES31.functional.synchronization.in_invocation.ssbo_alias_overwrite,Fail
dEQP-GLES31.functional.synchronization.in_invocation.ssbo_alias_write,Fail
dEQP-GLES31.functional.synchronization.in_invocation.ssbo_overwrite,Fail
dEQP-GLES31.functional.synchronization.in_invocation.ssbo_write_read,Fail
dEQP-GLES31.functional.synchronization.inter_invocation.image_alias_overwrite,Fail
dEQP-GLES31.functional.synchronization.inter_invocation.image_alias_write,Fail
dEQP-GLES31.functional.synchronization.inter_invocation.image_overwrite,Fail
dEQP-GLES31.functional.synchronization.inter_invocation.ssbo_alias_overwrite,Fail
dEQP-GLES31.functional.synchronization.inter_invocation.ssbo_alias_write,Fail
@ -1046,8 +1042,6 @@ spec@arb_shader_image_load_store@invalid@imageLoad/unbound image test,Fail
spec@arb_shader_image_load_store@max-images,Fail
spec@arb_shader_image_load_store@max-images@Combined max image uniforms test,Fail
spec@arb_shader_image_load_store@max-images@Fragment shader max image uniforms test,Fail
spec@arb_shader_image_load_store@restrict,Fail
spec@arb_shader_image_load_store@restrict@no qualifier image aliasing test,Fail
spec@arb_shader_storage_buffer_object@array-ssbo-auto-binding,Fail
spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail

View file

@ -165,7 +165,9 @@
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE 0x00000000
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK 0x00000003
#define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15)
#define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F)

View file

@ -240,14 +240,48 @@ int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
return 0;
}
void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack)
void
r600_bytecode_add_ack(struct r600_bytecode *bc)
{
bc->need_wait_ack = need_wait_ack;
bc->need_wait_ack = true;
}
boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc)
int
r600_bytecode_wait_acks(struct r600_bytecode *bc)
{
return bc->need_wait_ack;
/* Store acks are an R700+ feature. */
if (bc->chip_class < R700)
return 0;
if (!bc->need_wait_ack)
return 0;
int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
if (ret != 0)
return ret;
struct r600_bytecode_cf *cf = bc->cf_last;
cf->barrier = 1;
/* Request a wait if the number of outstanding acks is > 0 */
cf->cf_addr = 0;
return 0;
}
/*
 * Select the export "type" value to use for a memory write.
 *
 * For chip classes of R700 and above this returns the *_ACK_EG write
 * encodings; for anything older it returns the plain write encodings.
 * `indirect` picks the _IND variant in both cases.
 */
uint32_t
r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect)
{
	/* Pre-R700: only the plain write encodings are returned. */
	if (bc->chip_class < R700) {
		return indirect ? V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND
				: V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
	}

	return indirect ? V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG
			: V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG;
}
/* alu instructions that can only exist once per group */
@ -1536,10 +1570,8 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
int r;
/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) {
bc->need_wait_ack = false;
r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
}
if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH)
r600_bytecode_wait_acks(bc);
r = r600_bytecode_add_cf(bc);
if (r)

View file

@ -313,8 +313,11 @@ int r600_bytecode_add_output(struct r600_bytecode *bc,
const struct r600_bytecode_output *output);
int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
const struct r600_bytecode_output *output);
void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean needed);
boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc);
void r600_bytecode_add_ack(struct r600_bytecode *bc);
int r600_bytecode_wait_acks(struct r600_bytecode *bc);
uint32_t r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect);
int r600_bytecode_build(struct r600_bytecode *bc);
int r600_bytecode_add_cf(struct r600_bytecode *bc);
int r600_bytecode_add_cfinst(struct r600_bytecode *bc,

View file

@ -978,9 +978,18 @@ static int tgsi_barrier(struct r600_shader_ctx *ctx)
r = r600_bytecode_add_alu(ctx->bc, &alu);
if (r)
return r;
/* XXX: Need to implement GWS ops to sync across wavefronts */
return 0;
}
/*
 * Handler for TGSI_OPCODE_MEMBAR: delegates to r600_bytecode_wait_acks()
 * on the shader's bytecode and returns its result unchanged.
 */
static int tgsi_membar(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode *bc = ctx->bc;

	/* Wait for any SSBO/image stores to land. */
	return r600_bytecode_wait_acks(bc);
}
static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
@ -1662,10 +1671,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
else {
struct r600_bytecode_vtx vtx;
if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
r600_bytecode_need_wait_ack(ctx->bc, false);
r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
}
r600_bytecode_wait_acks(ctx->bc);
memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
vtx.op = FETCH_OP_READ_SCRATCH;
@ -4475,7 +4481,7 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
cf.op = CF_OP_MEM_SCRATCH;
cf.elem_size = 3;
cf.gpr = reg;
cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect);
cf.mark = 1;
cf.comp_mask = inst->Dst[0].Register.WriteMask;
cf.swizzle_x = 0;
@ -4485,10 +4491,6 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
cf.burst_count = 1;
if (tgsi_dst->Register.Indirect) {
if (ctx->bc->chip_class < R700)
cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
else
cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
cf.index_gpr = ctx->bc->ar_reg;
}
else {
@ -4500,8 +4502,8 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
if (r)
return;
if (ctx->bc->chip_class >= R700)
r600_bytecode_need_wait_ack(ctx->bc, true);
r600_bytecode_add_ack(ctx->bc);
}
return;
}
@ -8952,9 +8954,8 @@ static int tgsi_load_rat(struct r600_shader_ctx *ctx)
cf->mark = 1;
cf->output.elem_size = 0;
r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
cf = ctx->bc->cf_last;
cf->barrier = 1;
r600_bytecode_add_ack(ctx->bc);
r600_bytecode_wait_acks(ctx->bc);
desc = util_format_description(inst->Memory.Format);
r600_vertex_data_type(inst->Memory.Format,
@ -9055,6 +9056,7 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
return r;
}
cf = NULL;
lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
for (i = 0; i <= lasti; i++) {
struct r600_bytecode_alu alu;
@ -9095,6 +9097,14 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
cf->barrier = 1;
cf->output.elem_size = 0;
}
/* Request an ack from the last write emitted. */
if (cf) {
cf->mark = true;
cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
r600_bytecode_add_ack(ctx->bc);
}
return 0;
}
@ -9144,7 +9154,7 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
cf->rat.inst = V_RAT_INST_STORE_TYPED;
cf->rat.index_mode = rat_index_mode;
cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true);
cf->output.gpr = val_gpr;
cf->output.index_gpr = idx_gpr;
cf->output.comp_mask = 0xf;
@ -9152,6 +9162,10 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
cf->vpm = 1;
cf->barrier = 1;
cf->output.elem_size = 0;
cf->mark = 1;
r600_bytecode_add_ack(ctx->bc);
return 0;
}
@ -9324,10 +9338,9 @@ static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
cf->barrier = 1;
cf->mark = 1;
cf->output.elem_size = 0;
r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
cf = ctx->bc->cf_last;
cf->barrier = 1;
cf->cf_addr = 1;
r600_bytecode_add_ack(ctx->bc);
r600_bytecode_wait_acks(ctx->bc);
memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
@ -12084,7 +12097,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
[113] = { ALU_OP0_NOP, tgsi_unsupported},
[114] = { ALU_OP0_NOP, tgsi_unsupported},
[115] = { ALU_OP0_NOP, tgsi_unsupported},
@ -12311,7 +12324,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar},
[113] = { ALU_OP0_NOP, tgsi_unsupported},
[114] = { ALU_OP0_NOP, tgsi_unsupported},
[115] = { ALU_OP0_NOP, tgsi_unsupported},

View file

@ -118,6 +118,11 @@
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003
/* R700+-only */
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG 0x00000002
#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG 0x00000003
#define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15)
#define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F)
#define C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR 0xFFC07FFF