diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index aac729f9d57..ecc1cf4d912 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -695,6 +695,8 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n,
    }
 
    switch (opc_cat(instr->opc)) {
+   case 0: /* end, chmask */
+      return flags == 0;
    case 1:
       valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
       if (flags & ~valid_flags)
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index bcac2103e74..89b3474ca3c 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -304,11 +304,11 @@ struct ir3_instruction {
          int off;              /* component/offset */
       } split;
       struct {
-         /* for output collects, this maps back to the entry in the
+         /* Per-source index back to the entry in the
           * ir3_shader_variant::outputs table.
           */
-         int outidx;
-      } collect;
+         unsigned *outidxs;
+      } end;
       struct {
          unsigned samp, tex;
         unsigned input_offset;
@@ -462,7 +462,6 @@ struct ir3 {
    gl_shader_stage type;
 
    DECLARE_ARRAY(struct ir3_instruction *, inputs);
-   DECLARE_ARRAY(struct ir3_instruction *, outputs);
 
    /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
@@ -1228,14 +1227,6 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 #define foreach_input(__ininstr, __ir) \
    foreach_input_n(__ininstr, __i, __ir)
 
-/* iterators for shader outputs: */
-#define foreach_output_n(__outinstr, __cnt, __ir) \
-   for (struct ir3_instruction *__outinstr = (void *)~0; __outinstr; __outinstr = NULL) \
-      for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
-         if ((__outinstr = (__ir)->outputs[__cnt]))
-#define foreach_output(__outinstr, __ir) \
-   foreach_output_n(__outinstr, __i, __ir)
-
 /* iterators for instructions: */
 #define foreach_instr(__instr, __list) \
    list_for_each_entry(struct ir3_instruction, __instr, __list, node)
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index 412a7528584..8481e5b0307 100644
--- a/src/freedreno/ir3/ir3_a6xx.c
+++ b/src/freedreno/ir3/ir3_a6xx.c
@@ -445,13 +445,5 @@ ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
       }
    }
 
-   /* we also need to fixup shader outputs: */
-   foreach_output_n (out, n, ir) {
-      if (is_atomic(out->opc) && (out->flags & IR3_INSTR_G)) {
-         ir->outputs[n] = get_atomic_dest_mov(out);
-         progress = true;
-      }
-   }
-
    return progress;
 }
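The ir3.h hunk above is the core of the rework: instead of a separate ir->outputs array of collect instructions, the end (or chmask) instruction now carries the shader outputs directly as its sources, and end.outidxs[] records, per source, the matching entry in ir3_shader_variant::outputs. A consumer of that pairing looks like the sketch below, which simply mirrors the regid-assignment loop added to ir3_compile_shader_nir() later in this patch (copy_output_regids is a hypothetical name used only for illustration; the usual ir3.h / ir3_shader.h definitions are assumed):

   static void
   copy_output_regids(struct ir3_instruction *end, struct ir3_shader_variant *so)
   {
      /* regs[0] is the dst, so source i lives in regs[i] for i >= 1 and
       * pairs with end.outidxs[i - 1]:
       */
      for (unsigned i = 1; i < end->regs_count; i++) {
         unsigned outidx = end->end.outidxs[i - 1];
         struct ir3_register *reg = end->regs[i];

         so->outputs[outidx].regid = reg->num;
         so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
      }
   }

The same walk works for OPC_END and OPC_CHMASK, since both use the end union member.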
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 883f0314539..34f165b4d81 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2999,27 +2999,6 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl)
       emit_stream_out(ctx);
    }
 
-   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
-    * NOP and has an epilogue that writes the VS outputs to local storage, to
-    * be read by the HS.  Then it resets execution mask (chmask) and chains
-    * to the next shader (chsh).
-    */
-   if ((ctx->so->type == MESA_SHADER_VERTEX &&
-         (ctx->so->key.has_gs || ctx->so->key.tessellation)) ||
-         (ctx->so->type == MESA_SHADER_TESS_EVAL && ctx->so->key.has_gs)) {
-      struct ir3_instruction *chmask =
-         ir3_CHMASK(ctx->block);
-      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
-      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
-
-      struct ir3_instruction *chsh =
-         ir3_CHSH(ctx->block);
-      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
-      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
-   } else {
-      ir3_END(ctx->block);
-   }
-
    setup_predecessors(ctx->ir);
 }
 
@@ -3550,26 +3529,36 @@ output_slot_used_for_binning(gl_varying_slot slot)
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1;
 }
+
+static struct ir3_instruction *find_end(struct ir3 *ir)
+{
+   foreach_block_rev (block, &ir->block_list) {
+      foreach_instr_rev (instr, &block->instr_list) {
+         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+            return instr;
+      }
+   }
+   unreachable("couldn't find end instruction");
+}
+
 static void
-fixup_binning_pass(struct ir3_context *ctx)
+fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
 {
    struct ir3_shader_variant *so = ctx->so;
-   struct ir3 *ir = ctx->ir;
    unsigned i, j;
 
    /* first pass, remove unused outputs from the IR level outputs: */
-   for (i = 0, j = 0; i < ir->outputs_count; i++) {
-      struct ir3_instruction *out = ir->outputs[i];
-      assert(out->opc == OPC_META_COLLECT);
-      unsigned outidx = out->collect.outidx;
+   for (i = 0, j = 0; i < end->regs_count - 1; i++) {
+      unsigned outidx = end->end.outidxs[i];
       unsigned slot = so->outputs[outidx].slot;
 
       if (output_slot_used_for_binning(slot)) {
-         ir->outputs[j] = ir->outputs[i];
+         end->regs[j + 1] = end->regs[i + 1];
+         end->end.outidxs[j] = end->end.outidxs[i];
         j++;
       }
    }
-   ir->outputs_count = j;
+   end->regs_count = j + 1;
 
    /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
    * table:
@@ -3581,9 +3570,9 @@ fixup_binning_pass(struct ir3_context *ctx)
          so->outputs[j] = so->outputs[i];
 
          /* fixup outidx to point to new output table entry: */
-         foreach_output (out, ir) {
-            if (out->collect.outidx == i) {
-               out->collect.outidx = j;
+         for (unsigned k = 0; k < end->regs_count - 1; k++) {
+            if (end->end.outidxs[k] == i) {
+               end->end.outidxs[k] = j;
               break;
            }
         }
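find_end() and the new fixup_binning_pass() signature are meant to be used together: the caller locates the terminating end/chmask instruction and hands it to the fixup, which trims the terminator's sources in place. That is exactly what the a6xx binning-pass path further down in this patch does; repeated here only as a usage sketch of the new calling convention:

   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {}
   }

On older GPUs the call happens right after the end instruction is built, so the end pointer is already at hand and find_end() isn't needed.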
@@ -3671,61 +3660,27 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    ir = so->ir = ctx->ir;
 
-   assert((ctx->noutputs % 4) == 0);
-
-   /* Setup IR level outputs, which are "collects" that gather
-    * the scalar components of outputs.
+   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
+    * NOP and have an epilogue that writes the VS outputs to local storage, to
+    * be read by the HS.  Then they reset the execution mask (chmask) and chain
+    * to the next shader (chsh).
     */
-   for (unsigned i = 0; i < ctx->noutputs; i += 4) {
-      unsigned ncomp = 0;
-      /* figure out the # of components written:
-       *
-       * TODO do we need to handle holes, ie. if .x and .z
-       * components written, but .y component not written?
-       */
-      for (unsigned j = 0; j < 4; j++) {
-         if (!ctx->outputs[i + j])
-            break;
-         ncomp++;
-      }
+   if ((so->type == MESA_SHADER_VERTEX &&
+         (so->key.has_gs || so->key.tessellation)) ||
+         (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
+      struct ir3_instruction *outputs[3];
+      unsigned outidxs[3];
+      unsigned outputs_count = 0;
 
-      /* Note that in some stages, like TCS, store_output is
-       * lowered to memory writes, so no components of the
-       * are "written" from the PoV of traditional store-
-       * output instructions:
-       */
-      if (!ncomp)
-         continue;
-
-      struct ir3_instruction *out =
-         ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
-
-      int outidx = i / 4;
-      assert(outidx < so->outputs_count);
-
-      /* stash index into so->outputs[] so we can map the
-       * output back to slot/etc later:
-       */
-      out->collect.outidx = outidx;
-
-      array_insert(ir, ir->outputs, out);
-   }
-
-   /* Set up the gs header as an output for the vertex shader so it won't
-    * clobber it for the tess ctrl shader.
-    *
-    * TODO this could probably be done more cleanly in a nir pass.
-    */
-   if (ctx->so->type == MESA_SHADER_VERTEX ||
-         (ctx->so->key.has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
       if (ctx->primitive_id) {
          unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->primitive_id, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
 
       if (ctx->gs_header) {
@@ -3733,8 +3688,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->gs_header, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
 
       if (ctx->tcs_header) {
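The primitive_id / gs_header / tcs_header cases in this hunk and the next all append a system-value output the same way: reserve a slot in so->outputs, collect the value, and remember which source pairs with which table entry. Purely as a reading aid, the repeated pattern is equivalent to a helper along these lines (add_header_output is a hypothetical name; the patch keeps it open-coded):

   static void
   add_header_output(struct ir3_context *ctx, struct ir3_shader_variant *so,
                     gl_varying_slot slot, struct ir3_instruction *src,
                     struct ir3_instruction **outputs, unsigned *outidxs,
                     unsigned *outputs_count)
   {
      unsigned n = so->outputs_count++;
      so->outputs[n].slot = slot;

      outputs[*outputs_count] = ir3_create_collect(ctx, &src, 1);
      outidxs[*outputs_count] = n;
      (*outputs_count)++;
   }

With that, the gs_header block would read, e.g., add_header_output(ctx, so, VARYING_SLOT_GS_HEADER_IR3, ctx->gs_header, outputs, outidxs, &outputs_count).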
@@ -3742,40 +3698,115 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->tcs_header, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
-   }
 
-   /* for a6xx+, binning and draw pass VS use same VBO state, so we
-    * need to make sure not to remove any inputs that are used by
-    * the nonbinning VS.
-    */
-   if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
-         so->type == MESA_SHADER_VERTEX) {
-      for (int i = 0; i < ctx->ninputs; i++) {
-         struct ir3_instruction *in = ctx->inputs[i];
+      struct ir3_instruction *chmask =
+         ir3_instr_create(ctx->block, OPC_CHMASK, outputs_count + 1);
+      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
+      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
 
-         if (!in)
+      __ssa_dst(chmask);
+      for (unsigned i = 0; i < outputs_count; i++)
+         __ssa_src(chmask, outputs[i], 0);
+
+      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
+      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, chmask);
+
+      struct ir3_instruction *chsh =
+         ir3_CHSH(ctx->block);
+      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
+      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
+   } else {
+      assert((ctx->noutputs % 4) == 0);
+      unsigned outidxs[ctx->noutputs / 4];
+      struct ir3_instruction *outputs[ctx->noutputs / 4];
+      unsigned outputs_count = 0;
+
+      /* Setup IR level outputs, which are "collects" that gather
+       * the scalar components of outputs.
+       */
+      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+         unsigned ncomp = 0;
+         /* figure out the # of components written:
+          *
+          * TODO do we need to handle holes, ie. if .x and .z
+          * components written, but .y component not written?
+          */
+         for (unsigned j = 0; j < 4; j++) {
+            if (!ctx->outputs[i + j])
+               break;
+            ncomp++;
+         }
+
+         /* Note that in some stages, like TCS, store_output is
+          * lowered to memory writes, so no components of the output
+          * are "written" from the PoV of traditional store-
+          * output instructions:
+          */
+         if (!ncomp)
            continue;
 
-         unsigned n = i / 4;
-         unsigned c = i % 4;
+         struct ir3_instruction *out =
+            ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
 
-         debug_assert(n < so->nonbinning->inputs_count);
+         int outidx = i / 4;
+         assert(outidx < so->outputs_count);
 
-         if (so->nonbinning->inputs[n].sysval)
-            continue;
-
-         /* be sure to keep inputs, even if only used in VS */
-         if (so->nonbinning->inputs[n].compmask & (1 << c))
-            array_insert(in->block, in->block->keeps, in);
+         outidxs[outputs_count] = outidx;
+         outputs[outputs_count] = out;
+         outputs_count++;
       }
+
+      /* for a6xx+, binning and draw pass VS use same VBO state, so we
+       * need to make sure not to remove any inputs that are used by
+       * the nonbinning VS.
+       */
+      if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
+            so->type == MESA_SHADER_VERTEX) {
+         for (int i = 0; i < ctx->ninputs; i++) {
+            struct ir3_instruction *in = ctx->inputs[i];
+
+            if (!in)
+               continue;
+
+            unsigned n = i / 4;
+            unsigned c = i % 4;
+
+            debug_assert(n < so->nonbinning->inputs_count);
+
+            if (so->nonbinning->inputs[n].sysval)
+               continue;
+
+            /* be sure to keep inputs, even if only used in VS */
+            if (so->nonbinning->inputs[n].compmask & (1 << c))
+               array_insert(in->block, in->block->keeps, in);
+         }
+      }
+
+      struct ir3_instruction *end = ir3_instr_create(ctx->block, OPC_END,
+            outputs_count + 1);
+
+      __ssa_dst(end);
+      for (unsigned i = 0; i < outputs_count; i++) {
+         __ssa_src(end, outputs[i], 0);
+      }
+
+      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
+      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, end);
+
+      /* at this point, for binning pass, throw away unneeded outputs: */
+      if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+         fixup_binning_pass(ctx, end);
+   }
 
-   /* at this point, for binning pass, throw away unneeded outputs: */
-   if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-      fixup_binning_pass(ctx);
    ir3_debug_print(ir, "AFTER: nir->ir3");
    ir3_validate(ir);
@@ -3794,7 +3825,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    * we can re-use same VS_CONST state group.
    */
   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
-      fixup_binning_pass(ctx);
+      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {}
   }
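Both branches above build the terminator the same way: create the cat-0 instruction with one register slot per output plus the dummy dst, attach the collects as SSA sources, stash the outidxs array on the instruction, and add it to the block's keeps. A hypothetical helper capturing that shared pattern (emit_shader_terminator is an illustrative name, not something this patch adds):

   static struct ir3_instruction *
   emit_shader_terminator(struct ir3_block *block, opc_t opc,
                          struct ir3_instruction **outputs,
                          const unsigned *outidxs, unsigned outputs_count)
   {
      struct ir3_instruction *term =
         ir3_instr_create(block, opc, outputs_count + 1);

      __ssa_dst(term);
      for (unsigned i = 0; i < outputs_count; i++)
         __ssa_src(term, outputs[i], 0);

      term->end.outidxs = ralloc_array(term, unsigned, outputs_count);
      memcpy(term->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(block, block->keeps, term);
      return term;
   }

The chmask path would additionally set barrier_class/barrier_conflict and emit the trailing chsh, exactly as in the hunk above.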
@@ -3915,12 +3946,14 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    for (unsigned i = 0; i < so->outputs_count; i++)
       so->outputs[i].regid = INVALID_REG;
 
-   foreach_output (out, ir) {
-      assert(out->opc == OPC_META_COLLECT);
-      unsigned outidx = out->collect.outidx;
+   struct ir3_instruction *end = find_end(so->ir);
 
-      so->outputs[outidx].regid = out->regs[0]->num;
-      so->outputs[outidx].half = !!(out->regs[0]->flags & IR3_REG_HALF);
+   for (unsigned i = 1; i < end->regs_count; i++) {
+      unsigned outidx = end->end.outidxs[i - 1];
+      struct ir3_register *reg = end->regs[i];
+
+      so->outputs[outidx].regid = reg->num;
+      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
    }
 
    foreach_input (in, ir) {
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
index 20453da5ef7..7e65228cb97 100644
--- a/src/freedreno/ir3/ir3_cp.c
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -350,8 +350,10 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
          return true;
       }
    } else if ((is_same_type_mov(src) || is_const_mov(src)) &&
-         /* cannot collapse const/immed/etc into meta instrs: */
-         !is_meta(instr)) {
+         /* cannot collapse const/immed/etc into meta instrs and control
+          * flow:
+          */
+         !is_meta(instr) && opc_cat(instr->opc) != 0) {
       /* immed/const/etc cases, which require some special handling: */
       struct ir3_register *src_reg = src->regs[1];
       unsigned new_flags = reg->flags;
@@ -643,11 +645,6 @@ ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
 
    ir3_clear_mark(ir);
 
-   foreach_output_n (out, n, ir) {
-      instr_cp(&ctx, out);
-      ir->outputs[n] = eliminate_output_mov(&ctx, out);
-   }
-
    foreach_block (block, &ir->block_list) {
       if (block->condition) {
          instr_cp(&ctx, block->condition);
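The guard added to reg_cp() pairs with the cat-0 case added to ir3_valid_flags() at the top of this patch: now that end/chmask consume real sources, copy propagation must not fold immediates, const-file reads, or relative-addressed values into them; the feeding mov has to stay. Stated as a predicate, illustrative only (the real checks are the two in the diff):

   static bool
   can_cp_into_terminator(struct ir3_instruction *instr, unsigned new_flags)
   {
      /* cat 0 (end, chmask): plain GPR sources only */
      if (opc_cat(instr->opc) == 0)
         return new_flags == 0;

      return true;   /* other categories: see ir3_valid_flags() */
   }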
diff --git a/src/freedreno/ir3/ir3_dce.c b/src/freedreno/ir3/ir3_dce.c
index e1e303f5482..a87704700c9 100644
--- a/src/freedreno/ir3/ir3_dce.c
+++ b/src/freedreno/ir3/ir3_dce.c
@@ -137,9 +137,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
    foreach_array (arr, &ir->array_list)
       arr->unused = true;
 
-   foreach_output (out, ir)
-      instr_dce(out, false);
-
    foreach_block (block, &ir->block_list) {
       for (i = 0; i < block->keeps_count; i++)
         instr_dce(block->keeps[i], false);
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index c85b9c8a383..c0615ffdc06 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -80,6 +80,10 @@ ir3_delayslots(struct ir3_instruction *assigner,
    if (assigner->opc == OPC_MOVMSK)
       return 4;
 
+   /* As far as we know, shader outputs don't need any delay. */
+   if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
+      return 0;
+
    /* assigner must be alu: */
    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
         is_mem(consumer)) {
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
index 61ecd5320b3..f0bb646b08a 100644
--- a/src/freedreno/ir3/ir3_group.c
+++ b/src/freedreno/ir3/ir3_group.c
@@ -165,9 +165,6 @@ find_neighbors(struct ir3 *ir)
    bool progress = false;
    unsigned i;
 
-   foreach_output (out, ir)
-      progress |= instr_find_neighbors(out);
-
    foreach_block (block, &ir->block_list) {
       for (i = 0; i < block->keeps_count; i++) {
         struct ir3_instruction *instr = block->keeps[i];
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 1f60b4dfaf4..4eb5362960d 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -240,7 +240,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
       printf(" ");
    }
 
-   if (!is_flow(instr)) {
+   if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
       for (unsigned i = 0, n = 0; i < instr->regs_count; i++) {
         struct ir3_register *reg = instr->regs[i];
 
@@ -397,9 +397,4 @@ ir3_print(struct ir3 *ir)
 {
    foreach_block (block, &ir->block_list)
       print_block(block, 0);
-
-   foreach_output_n (out, i, ir) {
-      printf("out%d: ", i);
-      print_instr(out, 0);
-   }
 }
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 28994620304..7ef7dbf37d9 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -1233,9 +1233,6 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
       foreach_input (in, ctx->ir) {
         reg_assign(ctx, in->regs[0], in);
       }
-      foreach_output (out, ctx->ir) {
-         reg_assign(ctx, out->regs[0], out);
-      }
    }
 }
 
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 55f4c730cda..431d7e98109 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -971,16 +971,15 @@ mark_kill_path(struct ir3_instruction *instr)
 static bool
 is_output_collect(struct ir3_instruction *instr)
 {
-   struct ir3 *ir = instr->block->shader;
+   if (instr->opc != OPC_META_COLLECT)
+      return false;
 
-   for (unsigned i = 0; i < ir->outputs_count; i++) {
-      struct ir3_instruction *collect = ir->outputs[i];
-      assert(collect->opc == OPC_META_COLLECT);
-      if (instr == collect)
-         return true;
+   foreach_ssa_use (use, instr) {
+      if (use->opc != OPC_END && use->opc != OPC_CHMASK)
+         return false;
    }
 
-   return false;
+   return true;
 }
 
 /* Is it's only use as output? */
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index 90822b0ed50..a4bfd88dced 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -661,17 +661,6 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
            fetch->wrmask, fetch->cmd);
    }
 
-   foreach_output_n (instr, i, ir) {
-      reg = instr->regs[0];
-      regid = reg->num;
-      fprintf(out, "@out(%sr%d.%c)\tout%d",
-            (reg->flags & IR3_REG_HALF) ? "h" : "",
-            (regid >> 2), "xyzw"[regid & 0x3], i);
-      if (reg->wrmask > 0x1)
-         fprintf(out, " (wrmask=0x%x)", reg->wrmask);
-      fprintf(out, "\n");
-   }
-
    const struct ir3_const_state *const_state = ir3_const_state(so);
    for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
       fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
"h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - if (reg->wrmask > 0x1) - fprintf(out, " (wrmask=0x%x)", reg->wrmask); - fprintf(out, "\n"); - } - const struct ir3_const_state *const_state = ir3_const_state(so); for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) { fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i); diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index f6cc9ba159d..85038eedda1 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -108,6 +108,8 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) } } else if (opc_cat(instr->opc) == 6) { /* handled below */ + } else if (opc_cat(instr->opc) == 0) { + /* end/chmask/etc are allowed to have different size sources */ } else if (n > 0) { validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF)); }