ir3: Rework outputs
Instead of using a separate outputs array, make the "end" instruction (or chmask) take the outputs as sources. This works better for the new RA, because it better models the fact that outputs are all consumed at the same time. With the old model, each output collect would be assumed dead after it was processed, and subsequent collects could use it when inserting shuffle code, which wouldn't work. The new RA also deletes collect instructions after lowering them to moves, so the information would be gone after RA anyway.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10591>
parent dd55bd8f68
commit 3c8a5d7e17
13 changed files with 166 additions and 171 deletions
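For orientation, a hedged sketch of the data-structure change this commit makes (illustrative stand-in types only — the real definitions are in the ir3.h hunk below; sizes and names here are arbitrary):

    /* Before: the shader carried a side array of output collects, and
     * each collect stored its own index into the variant's output table. */
    struct shader_old {
       struct instr *outputs[16];  /* one collect per output slot */
       unsigned outputs_count;     /* each collect has its own outidx */
    };

    /* After: a single terminator ("end" or chmask) consumes every output
     * as a source, and a parallel array records which output-table entry
     * each source corresponds to. Every output is therefore live until
     * the terminator executes, which is what the new RA needs to model. */
    struct terminator_new {
       struct instr *srcs[16];     /* srcs[i] is the i-th output value */
       unsigned srcs_count;
       unsigned *outidxs;          /* outidxs[i] = table entry for srcs[i] */
    };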
@@ -695,6 +695,8 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n,
 	}
 
 	switch (opc_cat(instr->opc)) {
+	case 0: /* end, chmask */
+		return flags == 0;
 	case 1:
 		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
 		if (flags & ~valid_flags)
@@ -304,11 +304,11 @@ struct ir3_instruction {
 			int off;              /* component/offset */
 		} split;
 		struct {
-			/* for output collects, this maps back to the entry in the
+			/* Per-source index back to the entry in the
 			 * ir3_shader_variant::outputs table.
 			 */
-			int outidx;
-		} collect;
+			unsigned *outidxs;
+		} end;
 		struct {
 			unsigned samp, tex;
 			unsigned input_offset;
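The invariant the new `end` struct establishes: source i of the end/chmask instruction and end.outidxs[i - 1] describe the same output (the offset exists because regs[0] is the destination). A minimal sketch of the lookup, with simplified stand-in types rather than the real ir3 ones:

    #include <stdio.h>

    struct reg { unsigned num; };

    struct instr {
       unsigned regs_count;              /* regs[0] is the dst, regs[1..] are srcs */
       struct reg *regs[8];
       struct { unsigned *outidxs; } end;
    };

    /* Report which output-table entry each terminator source feeds; this
     * is the same pairing the patch uses to assign regids after RA. */
    static void dump_outputs(const struct instr *end)
    {
       for (unsigned i = 1; i < end->regs_count; i++)
          printf("src %u -> outputs[%u] in r%u\n",
                 i, end->end.outidxs[i - 1], end->regs[i]->num);
    }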
@@ -462,7 +462,6 @@ struct ir3 {
 	gl_shader_stage type;
 
 	DECLARE_ARRAY(struct ir3_instruction *, inputs);
-	DECLARE_ARRAY(struct ir3_instruction *, outputs);
 
 	/* Track bary.f (and ldlv) instructions.. this is needed in
 	 * scheduling to ensure that all varying fetches happen before
@@ -1228,14 +1227,6 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 #define foreach_input(__ininstr, __ir) \
 	foreach_input_n(__ininstr, __i, __ir)
 
-/* iterators for shader outputs: */
-#define foreach_output_n(__outinstr, __cnt, __ir) \
-	for (struct ir3_instruction *__outinstr = (void *)~0; __outinstr; __outinstr = NULL) \
-		for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
-			if ((__outinstr = (__ir)->outputs[__cnt]))
-#define foreach_output(__outinstr, __ir) \
-	foreach_output_n(__outinstr, __i, __ir)
-
 /* iterators for instructions: */
 #define foreach_instr(__instr, __list) \
 	list_for_each_entry(struct ir3_instruction, __instr, __list, node)
@@ -445,13 +445,5 @@ ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
 		}
 	}
 
-	/* we also need to fixup shader outputs: */
-	foreach_output_n (out, n, ir) {
-		if (is_atomic(out->opc) && (out->flags & IR3_INSTR_G)) {
-			ir->outputs[n] = get_atomic_dest_mov(out);
-			progress = true;
-		}
-	}
-
 	return progress;
 }
@@ -2999,27 +2999,6 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl)
 		emit_stream_out(ctx);
 	}
 
-	/* Vertex shaders in a tessellation or geometry pipeline treat END as a
-	 * NOP and has an epilogue that writes the VS outputs to local storage, to
-	 * be read by the HS. Then it resets execution mask (chmask) and chains
-	 * to the next shader (chsh).
-	 */
-	if ((ctx->so->type == MESA_SHADER_VERTEX &&
-			(ctx->so->key.has_gs || ctx->so->key.tessellation)) ||
-			(ctx->so->type == MESA_SHADER_TESS_EVAL && ctx->so->key.has_gs)) {
-		struct ir3_instruction *chmask =
-			ir3_CHMASK(ctx->block);
-		chmask->barrier_class = IR3_BARRIER_EVERYTHING;
-		chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
-
-		struct ir3_instruction *chsh =
-			ir3_CHSH(ctx->block);
-		chsh->barrier_class = IR3_BARRIER_EVERYTHING;
-		chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
-	} else {
-		ir3_END(ctx->block);
-	}
-
 	setup_predecessors(ctx->ir);
 }
@@ -3550,26 +3529,36 @@ output_slot_used_for_binning(gl_varying_slot slot)
 		slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1;
 }
 
+static struct ir3_instruction *find_end(struct ir3 *ir)
+{
+	foreach_block_rev (block, &ir->block_list) {
+		foreach_instr_rev(instr, &block->instr_list) {
+			if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+				return instr;
+		}
+	}
+	unreachable("couldn't find end instruction");
+}
+
 static void
-fixup_binning_pass(struct ir3_context *ctx)
+fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
 {
 	struct ir3_shader_variant *so = ctx->so;
 	struct ir3 *ir = ctx->ir;
 	unsigned i, j;
 
 	/* first pass, remove unused outputs from the IR level outputs: */
-	for (i = 0, j = 0; i < ir->outputs_count; i++) {
-		struct ir3_instruction *out = ir->outputs[i];
-		assert(out->opc == OPC_META_COLLECT);
-		unsigned outidx = out->collect.outidx;
+	for (i = 0, j = 0; i < end->regs_count - 1; i++) {
+		unsigned outidx = end->end.outidxs[i];
 		unsigned slot = so->outputs[outidx].slot;
 
 		if (output_slot_used_for_binning(slot)) {
-			ir->outputs[j] = ir->outputs[i];
+			end->regs[j + 1] = end->regs[i + 1];
+			end->end.outidxs[j] = end->end.outidxs[i];
 			j++;
 		}
 	}
-	ir->outputs_count = j;
+	end->regs_count = j + 1;
 
 	/* second pass, cleanup the unused slots in ir3_shader_variant::outputs
 	 * table:
@@ -3581,9 +3570,9 @@ fixup_binning_pass(struct ir3_context *ctx)
 			so->outputs[j] = so->outputs[i];
 
 			/* fixup outidx to point to new output table entry: */
-			foreach_output (out, ir) {
-				if (out->collect.outidx == i) {
-					out->collect.outidx = j;
+			for (unsigned k = 0; k < end->regs_count - 1; k++) {
+				if (end->end.outidxs[k] == i) {
+					end->end.outidxs[k] = j;
 					break;
 				}
 			}
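fixup_binning_pass() now filters two parallel arrays — the terminator's sources and its outidxs — in lockstep, with a +1 skew because regs[0] is the destination. The idiom in isolation, as a generic hedged sketch (plain C, not the mesa helpers):

    /* Compact vals[]/tags[] in place, dropping entries whose tag fails
     * keep(); both arrays stay index-aligned. Returns the new count. */
    static unsigned compact(unsigned *vals, unsigned *tags, unsigned count,
                            int (*keep)(unsigned tag))
    {
       unsigned j = 0;
       for (unsigned i = 0; i < count; i++) {
          if (keep(tags[i])) {
             vals[j] = vals[i];
             tags[j] = tags[i];
             j++;
          }
       }
       return j;
    }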
@@ -3671,61 +3660,27 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	ir = so->ir = ctx->ir;
 
-	assert((ctx->noutputs % 4) == 0);
-
-	/* Setup IR level outputs, which are "collects" that gather
-	 * the scalar components of outputs.
+	/* Vertex shaders in a tessellation or geometry pipeline treat END as a
+	 * NOP and has an epilogue that writes the VS outputs to local storage, to
+	 * be read by the HS. Then it resets execution mask (chmask) and chains
+	 * to the next shader (chsh).
 	 */
-	for (unsigned i = 0; i < ctx->noutputs; i += 4) {
-		unsigned ncomp = 0;
-		/* figure out the # of components written:
-		 *
-		 * TODO do we need to handle holes, ie. if .x and .z
-		 * components written, but .y component not written?
-		 */
-		for (unsigned j = 0; j < 4; j++) {
-			if (!ctx->outputs[i + j])
-				break;
-			ncomp++;
-		}
+	if ((so->type == MESA_SHADER_VERTEX &&
+			(so->key.has_gs || so->key.tessellation)) ||
+			(so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
+		struct ir3_instruction *outputs[3];
+		unsigned outidxs[3];
+		unsigned outputs_count = 0;
 
-		/* Note that in some stages, like TCS, store_output is
-		 * lowered to memory writes, so no components of the
-		 * are "written" from the PoV of traditional store-
-		 * output instructions:
-		 */
-		if (!ncomp)
-			continue;
-
-		struct ir3_instruction *out =
-			ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
-
-		int outidx = i / 4;
-		assert(outidx < so->outputs_count);
-
-		/* stash index into so->outputs[] so we can map the
-		 * output back to slot/etc later:
-		 */
-		out->collect.outidx = outidx;
-
-		array_insert(ir, ir->outputs, out);
-	}
-
-	/* Set up the gs header as an output for the vertex shader so it won't
-	 * clobber it for the tess ctrl shader.
-	 *
-	 * TODO this could probably be done more cleanly in a nir pass.
-	 */
-	if (ctx->so->type == MESA_SHADER_VERTEX ||
-			(ctx->so->key.has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
 		if (ctx->primitive_id) {
 			unsigned n = so->outputs_count++;
 			so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
 
 			struct ir3_instruction *out =
 				ir3_create_collect(ctx, &ctx->primitive_id, 1);
-			out->collect.outidx = n;
-			array_insert(ir, ir->outputs, out);
+			outputs[outputs_count] = out;
+			outidxs[outputs_count] = n;
+			outputs_count++;
 		}
 
 		if (ctx->gs_header) {
@@ -3733,8 +3688,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 			so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
 			struct ir3_instruction *out =
 				ir3_create_collect(ctx, &ctx->gs_header, 1);
-			out->collect.outidx = n;
-			array_insert(ir, ir->outputs, out);
+			outputs[outputs_count] = out;
+			outidxs[outputs_count] = n;
+			outputs_count++;
 		}
 
 		if (ctx->tcs_header) {
@@ -3742,40 +3698,115 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 			so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
 			struct ir3_instruction *out =
 				ir3_create_collect(ctx, &ctx->tcs_header, 1);
-			out->collect.outidx = n;
-			array_insert(ir, ir->outputs, out);
+			outputs[outputs_count] = out;
+			outidxs[outputs_count] = n;
+			outputs_count++;
 		}
-	}
 
-	/* for a6xx+, binning and draw pass VS use same VBO state, so we
-	 * need to make sure not to remove any inputs that are used by
-	 * the nonbinning VS.
-	 */
-	if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
-			so->type == MESA_SHADER_VERTEX) {
-		for (int i = 0; i < ctx->ninputs; i++) {
-			struct ir3_instruction *in = ctx->inputs[i];
-
-			if (!in)
-				continue;
-
-			unsigned n = i / 4;
-			unsigned c = i % 4;
-
-			debug_assert(n < so->nonbinning->inputs_count);
-
-			if (so->nonbinning->inputs[n].sysval)
-				continue;
-
-			/* be sure to keep inputs, even if only used in VS */
-			if (so->nonbinning->inputs[n].compmask & (1 << c))
-				array_insert(in->block, in->block->keeps, in);
-		}
-	}
-
-	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-		fixup_binning_pass(ctx);
+		struct ir3_instruction *chmask =
+			ir3_instr_create(ctx->block, OPC_CHMASK, outputs_count + 1);
+		chmask->barrier_class = IR3_BARRIER_EVERYTHING;
+		chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
+
+		__ssa_dst(chmask);
+		for (unsigned i = 0; i < outputs_count; i++)
+			__ssa_src(chmask, outputs[i], 0);
+
+		chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
+		memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+		array_insert(ctx->block, ctx->block->keeps, chmask);
+
+		struct ir3_instruction *chsh =
+			ir3_CHSH(ctx->block);
+		chsh->barrier_class = IR3_BARRIER_EVERYTHING;
+		chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
+	} else {
+		assert((ctx->noutputs % 4) == 0);
+		unsigned outidxs[ctx->noutputs / 4];
+		struct ir3_instruction *outputs[ctx->noutputs / 4];
+		unsigned outputs_count = 0;
+
+		/* Setup IR level outputs, which are "collects" that gather
+		 * the scalar components of outputs.
+		 */
+		for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+			unsigned ncomp = 0;
+			/* figure out the # of components written:
+			 *
+			 * TODO do we need to handle holes, ie. if .x and .z
+			 * components written, but .y component not written?
+			 */
+			for (unsigned j = 0; j < 4; j++) {
+				if (!ctx->outputs[i + j])
+					break;
+				ncomp++;
+			}
+
+			/* Note that in some stages, like TCS, store_output is
+			 * lowered to memory writes, so no components of the
+			 * are "written" from the PoV of traditional store-
+			 * output instructions:
+			 */
+			if (!ncomp)
+				continue;
+
+			struct ir3_instruction *out =
+				ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
+
+			int outidx = i / 4;
+			assert(outidx < so->outputs_count);
+
+			outidxs[outputs_count] = outidx;
+			outputs[outputs_count] = out;
+			outputs_count++;
+		}
+
+		/* for a6xx+, binning and draw pass VS use same VBO state, so we
+		 * need to make sure not to remove any inputs that are used by
+		 * the nonbinning VS.
+		 */
+		if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
+				so->type == MESA_SHADER_VERTEX) {
+			for (int i = 0; i < ctx->ninputs; i++) {
+				struct ir3_instruction *in = ctx->inputs[i];
+
+				if (!in)
+					continue;
+
+				unsigned n = i / 4;
+				unsigned c = i % 4;
+
+				debug_assert(n < so->nonbinning->inputs_count);
+
+				if (so->nonbinning->inputs[n].sysval)
+					continue;
+
+				/* be sure to keep inputs, even if only used in VS */
+				if (so->nonbinning->inputs[n].compmask & (1 << c))
+					array_insert(in->block, in->block->keeps, in);
+			}
+		}
+
+		struct ir3_instruction *end = ir3_instr_create(ctx->block, OPC_END,
+				outputs_count + 1);
+
+		__ssa_dst(end);
+		for (unsigned i = 0; i < outputs_count; i++) {
+			__ssa_src(end, outputs[i], 0);
+		}
+
+		end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
+		memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+		array_insert(ctx->block, ctx->block->keeps, end);
+
+		/* at this point, for binning pass, throw away unneeded outputs: */
+		if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+			fixup_binning_pass(ctx, end);
+	}
 
 	ir3_debug_print(ir, "AFTER: nir->ir3");
 	ir3_validate(ir);
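The construction step above pairs each collect with an entry in outidxs before attaching both to the terminator. A self-contained sketch of that shape (stand-in types and plain malloc; __ssa_dst/__ssa_src and ralloc_array are the real helpers, everything below is simplified):

    #include <stdlib.h>
    #include <string.h>

    struct instr {
       unsigned regs_count;   /* 1 dst + count srcs */
       void **regs;           /* regs[0] = dst, regs[1..] = output values */
       unsigned *outidxs;     /* outidxs[i] names the table entry of src i+1 */
    };

    /* Build an end/chmask-style terminator over `count` outputs. */
    static struct instr *make_terminator(void **outputs, const unsigned *outidxs,
                                         unsigned count)
    {
       struct instr *end = calloc(1, sizeof(*end));
       end->regs = calloc(count + 1, sizeof(void *));
       end->regs_count = count + 1;        /* slot 0 stays reserved for the dst */
       for (unsigned i = 0; i < count; i++)
          end->regs[i + 1] = outputs[i];   /* sources start at regs[1] */
       end->outidxs = malloc(count * sizeof(unsigned));
       memcpy(end->outidxs, outidxs, count * sizeof(unsigned));
       return end;
    }

(Error handling is omitted; in ir3 the outidxs array is ralloc'd off the instruction itself.)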
@@ -3794,7 +3825,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	 * we can re-use same VS_CONST state group.
 	 */
 	if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
-		fixup_binning_pass(ctx);
+		fixup_binning_pass(ctx, find_end(ctx->so->ir));
 		/* cleanup the result of removing unneeded outputs: */
 		while (IR3_PASS(ir, ir3_dce, so)) {}
 	}
@@ -3915,12 +3946,14 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	for (unsigned i = 0; i < so->outputs_count; i++)
 		so->outputs[i].regid = INVALID_REG;
 
-	foreach_output (out, ir) {
-		assert(out->opc == OPC_META_COLLECT);
-		unsigned outidx = out->collect.outidx;
+	struct ir3_instruction *end = find_end(so->ir);
 
-		so->outputs[outidx].regid = out->regs[0]->num;
-		so->outputs[outidx].half = !!(out->regs[0]->flags & IR3_REG_HALF);
+	for (unsigned i = 1; i < end->regs_count; i++) {
+		unsigned outidx = end->end.outidxs[i - 1];
+		struct ir3_register *reg = end->regs[i];
+
+		so->outputs[outidx].regid = reg->num;
+		so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
 	}
 
 	foreach_input (in, ir) {
@@ -350,8 +350,10 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
 			return true;
 		}
 	} else if ((is_same_type_mov(src) || is_const_mov(src)) &&
-			/* cannot collapse const/immed/etc into meta instrs: */
-			!is_meta(instr)) {
+			/* cannot collapse const/immed/etc into meta instrs and control
+			 * flow:
+			 */
+			!is_meta(instr) && opc_cat(instr->opc) != 0) {
 		/* immed/const/etc cases, which require some special handling: */
 		struct ir3_register *src_reg = src->regs[1];
 		unsigned new_flags = reg->flags;
@@ -643,11 +645,6 @@ ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
 
 	ir3_clear_mark(ir);
 
-	foreach_output_n (out, n, ir) {
-		instr_cp(&ctx, out);
-		ir->outputs[n] = eliminate_output_mov(&ctx, out);
-	}
-
 	foreach_block (block, &ir->block_list) {
 		if (block->condition) {
 			instr_cp(&ctx, block->condition);
@@ -137,9 +137,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
 	foreach_array (arr, &ir->array_list)
 		arr->unused = true;
 
-	foreach_output (out, ir)
-		instr_dce(out, false);
-
 	foreach_block (block, &ir->block_list) {
 		for (i = 0; i < block->keeps_count; i++)
 			instr_dce(block->keeps[i], false);
@@ -80,6 +80,10 @@ ir3_delayslots(struct ir3_instruction *assigner,
 	if (assigner->opc == OPC_MOVMSK)
 		return 4;
 
+	/* As far as we know, shader outputs don't need any delay. */
+	if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
+		return 0;
+
 	/* assigner must be alu: */
 	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 			is_mem(consumer)) {
@@ -165,9 +165,6 @@ find_neighbors(struct ir3 *ir)
 	bool progress = false;
 	unsigned i;
 
-	foreach_output (out, ir)
-		progress |= instr_find_neighbors(out);
-
 	foreach_block (block, &ir->block_list) {
 		for (i = 0; i < block->keeps_count; i++) {
 			struct ir3_instruction *instr = block->keeps[i];
@@ -240,7 +240,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
 		printf(" ");
 	}
 
-	if (!is_flow(instr)) {
+	if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
 		for (unsigned i = 0, n = 0; i < instr->regs_count; i++) {
 			struct ir3_register *reg = instr->regs[i];
 
@@ -397,9 +397,4 @@ ir3_print(struct ir3 *ir)
 {
 	foreach_block (block, &ir->block_list)
 		print_block(block, 0);
-
-	foreach_output_n (out, i, ir) {
-		printf("out%d: ", i);
-		print_instr(out, 0);
-	}
 }
@@ -1233,9 +1233,6 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		foreach_input (in, ctx->ir) {
 			reg_assign(ctx, in->regs[0], in);
 		}
-		foreach_output (out, ctx->ir) {
-			reg_assign(ctx, out->regs[0], out);
-		}
 	}
 }
 
@@ -971,16 +971,15 @@ mark_kill_path(struct ir3_instruction *instr)
 static bool
 is_output_collect(struct ir3_instruction *instr)
 {
-	struct ir3 *ir = instr->block->shader;
+	if (instr->opc != OPC_META_COLLECT)
+		return false;
 
-	for (unsigned i = 0; i < ir->outputs_count; i++) {
-		struct ir3_instruction *collect = ir->outputs[i];
-		assert(collect->opc == OPC_META_COLLECT);
-		if (instr == collect)
-			return true;
+	foreach_ssa_use (use, instr) {
+		if (use->opc != OPC_END && use->opc != OPC_CHMASK)
+			return false;
 	}
 
-	return false;
+	return true;
 }
 
 /* Is it's only use as output? */
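After this hunk, is_output_collect() no longer consults a global outputs array; it asks whether every SSA use of the collect is a terminator. The predicate in isolation, with hypothetical use-list types (the real foreach_ssa_use iterator is ir3's):

    #include <stdbool.h>

    enum opc_kind { OPC_END_K, OPC_CHMASK_K, OPC_OTHER_K };  /* stand-ins */

    struct use { enum opc_kind consumer; struct use *next; };

    /* True iff no use is anything other than an end/chmask consumer;
     * vacuously true for zero uses, matching the patch's foreach loop. */
    static bool only_used_as_output(const struct use *uses)
    {
       for (const struct use *u = uses; u; u = u->next)
          if (u->consumer != OPC_END_K && u->consumer != OPC_CHMASK_K)
             return false;
       return true;
    }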
@@ -661,17 +661,6 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
 				fetch->wrmask, fetch->cmd);
 	}
 
-	foreach_output_n (instr, i, ir) {
-		reg = instr->regs[0];
-		regid = reg->num;
-		fprintf(out, "@out(%sr%d.%c)\tout%d",
-				(reg->flags & IR3_REG_HALF) ? "h" : "",
-				(regid >> 2), "xyzw"[regid & 0x3], i);
-		if (reg->wrmask > 0x1)
-			fprintf(out, " (wrmask=0x%x)", reg->wrmask);
-		fprintf(out, "\n");
-	}
-
 	const struct ir3_const_state *const_state = ir3_const_state(so);
 	for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
 		fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
@@ -108,6 +108,8 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
 			}
 		} else if (opc_cat(instr->opc) == 6) {
 			/* handled below */
+		} else if (opc_cat(instr->opc) == 0) {
+			/* end/chmask/etc are allowed to have different size sources */
 		} else if (n > 0) {
 			validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF));
 		}