diff --git a/src/gallium/drivers/r600/ci/r600-turks-fails.txt b/src/gallium/drivers/r600/ci/r600-turks-fails.txt index 937c7d2d988..2b4ea28e2a6 100644 --- a/src/gallium/drivers/r600/ci/r600-turks-fails.txt +++ b/src/gallium/drivers/r600/ci/r600-turks-fails.txt @@ -600,14 +600,10 @@ dEQP-GLES31.functional.stencil_texturing.format.depth32f_stencil8_2d_array,Fail dEQP-GLES31.functional.stencil_texturing.format.stencil_index8_2d,Fail dEQP-GLES31.functional.stencil_texturing.format.stencil_index8_2d_array,Fail -dEQP-GLES31.functional.synchronization.in_invocation.image_alias_overwrite,Fail -dEQP-GLES31.functional.synchronization.in_invocation.image_alias_write,Fail dEQP-GLES31.functional.synchronization.in_invocation.ssbo_alias_overwrite,Fail dEQP-GLES31.functional.synchronization.in_invocation.ssbo_alias_write,Fail dEQP-GLES31.functional.synchronization.in_invocation.ssbo_overwrite,Fail dEQP-GLES31.functional.synchronization.in_invocation.ssbo_write_read,Fail -dEQP-GLES31.functional.synchronization.inter_invocation.image_alias_overwrite,Fail -dEQP-GLES31.functional.synchronization.inter_invocation.image_alias_write,Fail dEQP-GLES31.functional.synchronization.inter_invocation.image_overwrite,Fail dEQP-GLES31.functional.synchronization.inter_invocation.ssbo_alias_overwrite,Fail dEQP-GLES31.functional.synchronization.inter_invocation.ssbo_alias_write,Fail @@ -1046,8 +1042,6 @@ spec@arb_shader_image_load_store@invalid@imageLoad/unbound image test,Fail spec@arb_shader_image_load_store@max-images,Fail spec@arb_shader_image_load_store@max-images@Combined max image uniforms test,Fail spec@arb_shader_image_load_store@max-images@Fragment shader max image uniforms test,Fail -spec@arb_shader_image_load_store@restrict,Fail -spec@arb_shader_image_load_store@restrict@no qualifier image aliasing test,Fail spec@arb_shader_storage_buffer_object@array-ssbo-auto-binding,Fail spec@arb_shader_storage_buffer_object@compiler@atomicmin-swizzle.vert,Fail diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h index c6280167a92..bdf9b330ee9 100644 --- a/src/gallium/drivers/r600/eg_sq.h +++ b/src/gallium/drivers/r600/eg_sq.h @@ -165,7 +165,9 @@ #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE 0x00000000 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK 0x00000002 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK 0x00000003 #define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15) #define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 321d5e3b555..c0a29e3d2f4 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -240,14 +240,48 @@ int r600_bytecode_add_pending_output(struct r600_bytecode *bc, return 0; } -void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack) +void +r600_bytecode_add_ack(struct r600_bytecode *bc) { - bc->need_wait_ack = need_wait_ack; + bc->need_wait_ack = true; } -boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc) +int +r600_bytecode_wait_acks(struct r600_bytecode *bc) { - return bc->need_wait_ack; + /* Store acks are an R700+ feature. */ + if (bc->chip_class < R700) + return 0; + + if (!bc->need_wait_ack) + return 0; + + int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK); + if (ret != 0) + return ret; + + struct r600_bytecode_cf *cf = bc->cf_last; + cf->barrier = 1; + /* Request a wait if the number of outstanding acks is > 0 */ + cf->cf_addr = 0; + + return 0; +} + +uint32_t +r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect) +{ + if (bc->chip_class >= R700) { + if (indirect) + return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG; + else + return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG; + } else { + if (indirect) + return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; + else + return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; + } } /* alu instructions that can ony exits once per group */ @@ -1536,10 +1570,8 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op) int r; /* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */ - if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) { - bc->need_wait_ack = false; - r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK); - } + if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH) + r600_bytecode_wait_acks(bc); r = r600_bytecode_add_cf(bc); if (r) diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index a526993b318..501d827744a 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -313,8 +313,11 @@ int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output); int r600_bytecode_add_pending_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output); -void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean needed); -boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc); + +void r600_bytecode_add_ack(struct r600_bytecode *bc); +int r600_bytecode_wait_acks(struct r600_bytecode *bc); +uint32_t r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect); + int r600_bytecode_build(struct r600_bytecode *bc); int r600_bytecode_add_cf(struct r600_bytecode *bc); int r600_bytecode_add_cfinst(struct r600_bytecode *bc, diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index dad366cda5b..664a0d1faf5 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -978,9 +978,18 @@ static int tgsi_barrier(struct r600_shader_ctx *ctx) r = r600_bytecode_add_alu(ctx->bc, &alu); if (r) return r; + + /* XXX: Need to implement GWS ops to sync across wavefronts */ + return 0; } +static int tgsi_membar(struct r600_shader_ctx *ctx) +{ + /* Wait for any SSBO/image stores to land. */ + return r600_bytecode_wait_acks(ctx->bc); +} + static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed) { // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays @@ -1662,10 +1671,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx, else { struct r600_bytecode_vtx vtx; - if (r600_bytecode_get_need_wait_ack(ctx->bc)) { - r600_bytecode_need_wait_ack(ctx->bc, false); - r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); - } + r600_bytecode_wait_acks(ctx->bc); memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); vtx.op = FETCH_OP_READ_SCRATCH; @@ -4475,7 +4481,7 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, cf.op = CF_OP_MEM_SCRATCH; cf.elem_size = 3; cf.gpr = reg; - cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; + cf.type = r600_bytecode_write_export_ack_type(ctx->bc, tgsi_dst->Register.Indirect); cf.mark = 1; cf.comp_mask = inst->Dst[0].Register.WriteMask; cf.swizzle_x = 0; @@ -4485,10 +4491,6 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, cf.burst_count = 1; if (tgsi_dst->Register.Indirect) { - if (ctx->bc->chip_class < R700) - cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; - else - cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK; cf.index_gpr = ctx->bc->ar_reg; } else { @@ -4500,8 +4502,8 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, if (r) return; - if (ctx->bc->chip_class >= R700) - r600_bytecode_need_wait_ack(ctx->bc, true); + r600_bytecode_add_ack(ctx->bc); + } return; } @@ -8952,9 +8954,8 @@ static int tgsi_load_rat(struct r600_shader_ctx *ctx) cf->mark = 1; cf->output.elem_size = 0; - r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); - cf = ctx->bc->cf_last; - cf->barrier = 1; + r600_bytecode_add_ack(ctx->bc); + r600_bytecode_wait_acks(ctx->bc); desc = util_format_description(inst->Memory.Format); r600_vertex_data_type(inst->Memory.Format, @@ -9055,6 +9056,7 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx) return r; } + cf = NULL; lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); for (i = 0; i <= lasti; i++) { struct r600_bytecode_alu alu; @@ -9095,6 +9097,14 @@ static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx) cf->barrier = 1; cf->output.elem_size = 0; } + + /* Request an ack from the last write emitted. */ + if (cf) { + cf->mark = true; + cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true); + r600_bytecode_add_ack(ctx->bc); + } + return 0; } @@ -9144,7 +9154,7 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx) cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index; cf->rat.inst = V_RAT_INST_STORE_TYPED; cf->rat.index_mode = rat_index_mode; - cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; + cf->output.type = r600_bytecode_write_export_ack_type(ctx->bc, true); cf->output.gpr = val_gpr; cf->output.index_gpr = idx_gpr; cf->output.comp_mask = 0xf; @@ -9152,6 +9162,10 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx) cf->vpm = 1; cf->barrier = 1; cf->output.elem_size = 0; + cf->mark = 1; + + r600_bytecode_add_ack(ctx->bc); + return 0; } @@ -9324,10 +9338,9 @@ static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx) cf->barrier = 1; cf->mark = 1; cf->output.elem_size = 0; - r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK); - cf = ctx->bc->cf_last; - cf->barrier = 1; - cf->cf_addr = 1; + + r600_bytecode_add_ack(ctx->bc); + r600_bytecode_wait_acks(ctx->bc); memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { @@ -12084,7 +12097,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, - [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, + [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar}, [113] = { ALU_OP0_NOP, tgsi_unsupported}, [114] = { ALU_OP0_NOP, tgsi_unsupported}, [115] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -12311,7 +12324,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, - [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, + [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_membar}, [113] = { ALU_OP0_NOP, tgsi_unsupported}, [114] = { ALU_OP0_NOP, tgsi_unsupported}, [115] = { ALU_OP0_NOP, tgsi_unsupported}, diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h index 6b07dc1ecfc..12c2c61150a 100644 --- a/src/gallium/drivers/r600/r600_sq.h +++ b/src/gallium/drivers/r600/r600_sq.h @@ -118,6 +118,11 @@ #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003 + +/* R700+-only */ +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG 0x00000002 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG 0x00000003 + #define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((unsigned)(x) & 0x7F) << 15) #define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F) #define C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR 0xFFC07FFF