diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index aac729f9d57..ecc1cf4d912 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -695,6 +695,8 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n,
    }
 
    switch (opc_cat(instr->opc)) {
+   case 0: /* end, chmask */
+      return flags == 0;
    case 1:
       valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
       if (flags & ~valid_flags)
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index bcac2103e74..89b3474ca3c 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -304,11 +304,11 @@ struct ir3_instruction {
          int off;              /* component/offset */
       } split;
       struct {
-         /* for output collects, this maps back to the entry in the
+         /* Per-source index back to the entry in the
           * ir3_shader_variant::outputs table.
           */
-         int outidx;
-      } collect;
+         unsigned *outidxs;
+      } end;
       struct {
          unsigned samp, tex;
         unsigned input_offset;
@@ -462,7 +462,6 @@ struct ir3 {
    gl_shader_stage type;
 
    DECLARE_ARRAY(struct ir3_instruction *, inputs);
-   DECLARE_ARRAY(struct ir3_instruction *, outputs);
 
    /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
@@ -1228,14 +1227,6 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 #define foreach_input(__ininstr, __ir) \
    foreach_input_n(__ininstr, __i, __ir)
 
-/* iterators for shader outputs: */
-#define foreach_output_n(__outinstr, __cnt, __ir) \
-   for (struct ir3_instruction *__outinstr = (void *)~0; __outinstr; __outinstr = NULL) \
-      for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
-         if ((__outinstr = (__ir)->outputs[__cnt]))
-#define foreach_output(__outinstr, __ir) \
-   foreach_output_n(__outinstr, __i, __ir)
-
 /* iterators for instructions: */
 #define foreach_instr(__instr, __list) \
    list_for_each_entry(struct ir3_instruction, __instr, __list, node)
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index 412a7528584..8481e5b0307 100644
--- a/src/freedreno/ir3/ir3_a6xx.c
+++ b/src/freedreno/ir3/ir3_a6xx.c
@@ -445,13 +445,5 @@ ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so)
       }
    }
 
-   /* we also need to fixup shader outputs: */
-   foreach_output_n (out, n, ir) {
-      if (is_atomic(out->opc) && (out->flags & IR3_INSTR_G)) {
-         ir->outputs[n] = get_atomic_dest_mov(out);
-         progress = true;
-      }
-   }
-
    return progress;
 }
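The ir3.h hunk above is the core of the rework: instead of a separate ir->outputs array of collect instructions, the end (or chmask) instruction now carries the shader outputs directly as its sources, and end.outidxs[] records, per source, the matching entry in ir3_shader_variant::outputs. A consumer of that pairing looks like the sketch below, which simply mirrors the regid-assignment loop added to ir3_compile_shader_nir() later in this patch (copy_output_regids is a hypothetical name used only for illustration; the usual ir3.h / ir3_shader.h definitions are assumed):

   static void
   copy_output_regids(struct ir3_instruction *end, struct ir3_shader_variant *so)
   {
      /* regs[0] is the dst, so source i lives in regs[i] for i >= 1 and
       * pairs with end.outidxs[i - 1]:
       */
      for (unsigned i = 1; i < end->regs_count; i++) {
         unsigned outidx = end->end.outidxs[i - 1];
         struct ir3_register *reg = end->regs[i];

         so->outputs[outidx].regid = reg->num;
         so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
      }
   }

The same walk works for OPC_END and OPC_CHMASK, since both use the end union member.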
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 883f0314539..34f165b4d81 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2999,27 +2999,6 @@ emit_function(struct ir3_context *ctx, nir_function_impl *impl)
       emit_stream_out(ctx);
    }
 
-   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
-    * NOP and has an epilogue that writes the VS outputs to local storage, to
-    * be read by the HS.  Then it resets execution mask (chmask) and chains
-    * to the next shader (chsh).
-    */
-   if ((ctx->so->type == MESA_SHADER_VERTEX &&
-         (ctx->so->key.has_gs || ctx->so->key.tessellation)) ||
-         (ctx->so->type == MESA_SHADER_TESS_EVAL && ctx->so->key.has_gs)) {
-      struct ir3_instruction *chmask =
-         ir3_CHMASK(ctx->block);
-      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
-      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
-
-      struct ir3_instruction *chsh =
-         ir3_CHSH(ctx->block);
-      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
-      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
-   } else {
-      ir3_END(ctx->block);
-   }
-
    setup_predecessors(ctx->ir);
 }
 
@@ -3550,26 +3529,36 @@ output_slot_used_for_binning(gl_varying_slot slot)
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1;
 }
+
+static struct ir3_instruction *find_end(struct ir3 *ir)
+{
+   foreach_block_rev (block, &ir->block_list) {
+      foreach_instr_rev (instr, &block->instr_list) {
+         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+            return instr;
+      }
+   }
+   unreachable("couldn't find end instruction");
+}
+
 static void
-fixup_binning_pass(struct ir3_context *ctx)
+fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
 {
    struct ir3_shader_variant *so = ctx->so;
-   struct ir3 *ir = ctx->ir;
    unsigned i, j;
 
    /* first pass, remove unused outputs from the IR level outputs: */
-   for (i = 0, j = 0; i < ir->outputs_count; i++) {
-      struct ir3_instruction *out = ir->outputs[i];
-      assert(out->opc == OPC_META_COLLECT);
-      unsigned outidx = out->collect.outidx;
+   for (i = 0, j = 0; i < end->regs_count - 1; i++) {
+      unsigned outidx = end->end.outidxs[i];
       unsigned slot = so->outputs[outidx].slot;
 
       if (output_slot_used_for_binning(slot)) {
-         ir->outputs[j] = ir->outputs[i];
+         end->regs[j + 1] = end->regs[i + 1];
+         end->end.outidxs[j] = end->end.outidxs[i];
         j++;
       }
    }
-   ir->outputs_count = j;
+   end->regs_count = j + 1;
 
    /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
    * table:
@@ -3581,9 +3570,9 @@ fixup_binning_pass(struct ir3_context *ctx)
          so->outputs[j] = so->outputs[i];
 
          /* fixup outidx to point to new output table entry: */
-         foreach_output (out, ir) {
-            if (out->collect.outidx == i) {
-               out->collect.outidx = j;
+         for (unsigned k = 0; k < end->regs_count - 1; k++) {
+            if (end->end.outidxs[k] == i) {
+               end->end.outidxs[k] = j;
               break;
            }
         }
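find_end() and the new fixup_binning_pass() signature are meant to be used together: the caller locates the terminating end/chmask instruction and hands it to the fixup, which trims the terminator's sources in place. That is exactly what the a6xx binning-pass path further down in this patch does; repeated here only as a usage sketch of the new calling convention:

   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {}
   }

On older GPUs the call happens right after the end instruction is built, so the end pointer is already at hand and find_end() isn't needed.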
@@ -3671,61 +3660,27 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    ir = so->ir = ctx->ir;
 
-   assert((ctx->noutputs % 4) == 0);
-
-   /* Setup IR level outputs, which are "collects" that gather
-    * the scalar components of outputs.
+   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
+    * NOP and have an epilogue that writes the VS outputs to local storage, to
+    * be read by the HS.  Then they reset the execution mask (chmask) and chain
+    * to the next shader (chsh).
     */
-   for (unsigned i = 0; i < ctx->noutputs; i += 4) {
-      unsigned ncomp = 0;
-      /* figure out the # of components written:
-       *
-       * TODO do we need to handle holes, ie. if .x and .z
-       * components written, but .y component not written?
-       */
-      for (unsigned j = 0; j < 4; j++) {
-         if (!ctx->outputs[i + j])
-            break;
-         ncomp++;
-      }
+   if ((so->type == MESA_SHADER_VERTEX &&
+         (so->key.has_gs || so->key.tessellation)) ||
+         (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
+      struct ir3_instruction *outputs[3];
+      unsigned outidxs[3];
+      unsigned outputs_count = 0;
 
-      /* Note that in some stages, like TCS, store_output is
-       * lowered to memory writes, so no components of the
-       * are "written" from the PoV of traditional store-
-       * output instructions:
-       */
-      if (!ncomp)
-         continue;
-
-      struct ir3_instruction *out =
-         ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
-
-      int outidx = i / 4;
-      assert(outidx < so->outputs_count);
-
-      /* stash index into so->outputs[] so we can map the
-       * output back to slot/etc later:
-       */
-      out->collect.outidx = outidx;
-
-      array_insert(ir, ir->outputs, out);
-   }
-
-   /* Set up the gs header as an output for the vertex shader so it won't
-    * clobber it for the tess ctrl shader.
-    *
-    * TODO this could probably be done more cleanly in a nir pass.
-    */
-   if (ctx->so->type == MESA_SHADER_VERTEX ||
-         (ctx->so->key.has_gs && ctx->so->type == MESA_SHADER_TESS_EVAL)) {
       if (ctx->primitive_id) {
          unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->primitive_id, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
 
       if (ctx->gs_header) {
@@ -3733,8 +3688,9 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->gs_header, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
 
       if (ctx->tcs_header) {
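The primitive_id / gs_header / tcs_header cases in this hunk and the next all append a system-value output the same way: reserve a slot in so->outputs, collect the value, and remember which source pairs with which table entry. Purely as a reading aid, the repeated pattern is equivalent to a helper along these lines (add_header_output is a hypothetical name; the patch keeps it open-coded):

   static void
   add_header_output(struct ir3_context *ctx, struct ir3_shader_variant *so,
                     gl_varying_slot slot, struct ir3_instruction *src,
                     struct ir3_instruction **outputs, unsigned *outidxs,
                     unsigned *outputs_count)
   {
      unsigned n = so->outputs_count++;
      so->outputs[n].slot = slot;

      outputs[*outputs_count] = ir3_create_collect(ctx, &src, 1);
      outidxs[*outputs_count] = n;
      (*outputs_count)++;
   }

With that, the gs_header block would read, e.g., add_header_output(ctx, so, VARYING_SLOT_GS_HEADER_IR3, ctx->gs_header, outputs, outidxs, &outputs_count).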
@@ -3742,40 +3698,115 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
         struct ir3_instruction *out =
            ir3_create_collect(ctx, &ctx->tcs_header, 1);
-         out->collect.outidx = n;
-         array_insert(ir, ir->outputs, out);
+         outputs[outputs_count] = out;
+         outidxs[outputs_count] = n;
+         outputs_count++;
       }
-   }
 
-   /* for a6xx+, binning and draw pass VS use same VBO state, so we
-    * need to make sure not to remove any inputs that are used by
-    * the nonbinning VS.
-    */
-   if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
-         so->type == MESA_SHADER_VERTEX) {
-      for (int i = 0; i < ctx->ninputs; i++) {
-         struct ir3_instruction *in = ctx->inputs[i];
+      struct ir3_instruction *chmask =
+         ir3_instr_create(ctx->block, OPC_CHMASK, outputs_count + 1);
+      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
+      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
 
-         if (!in)
+      __ssa_dst(chmask);
+      for (unsigned i = 0; i < outputs_count; i++)
+         __ssa_src(chmask, outputs[i], 0);
+
+      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
+      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, chmask);
+
+      struct ir3_instruction *chsh =
+         ir3_CHSH(ctx->block);
+      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
+      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
+   } else {
+      assert((ctx->noutputs % 4) == 0);
+      unsigned outidxs[ctx->noutputs / 4];
+      struct ir3_instruction *outputs[ctx->noutputs / 4];
+      unsigned outputs_count = 0;
+
+      /* Setup IR level outputs, which are "collects" that gather
+       * the scalar components of outputs.
+       */
+      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
+         unsigned ncomp = 0;
+         /* figure out the # of components written:
+          *
+          * TODO do we need to handle holes, ie. if .x and .z
+          * components written, but .y component not written?
+          */
+         for (unsigned j = 0; j < 4; j++) {
+            if (!ctx->outputs[i + j])
+               break;
+            ncomp++;
+         }
+
+         /* Note that in some stages, like TCS, store_output is
+          * lowered to memory writes, so no components of the output
+          * are "written" from the PoV of traditional store-
+          * output instructions:
+          */
+         if (!ncomp)
            continue;
 
-         unsigned n = i / 4;
-         unsigned c = i % 4;
+         struct ir3_instruction *out =
+            ir3_create_collect(ctx, &ctx->outputs[i], ncomp);
 
-         debug_assert(n < so->nonbinning->inputs_count);
+         int outidx = i / 4;
+         assert(outidx < so->outputs_count);
 
-         if (so->nonbinning->inputs[n].sysval)
-            continue;
-
-         /* be sure to keep inputs, even if only used in VS */
-         if (so->nonbinning->inputs[n].compmask & (1 << c))
-            array_insert(in->block, in->block->keeps, in);
+         outidxs[outputs_count] = outidx;
+         outputs[outputs_count] = out;
+         outputs_count++;
       }
+
+      /* for a6xx+, binning and draw pass VS use same VBO state, so we
+       * need to make sure not to remove any inputs that are used by
+       * the nonbinning VS.
+       */
+      if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
+            so->type == MESA_SHADER_VERTEX) {
+         for (int i = 0; i < ctx->ninputs; i++) {
+            struct ir3_instruction *in = ctx->inputs[i];
+
+            if (!in)
+               continue;
+
+            unsigned n = i / 4;
+            unsigned c = i % 4;
+
+            debug_assert(n < so->nonbinning->inputs_count);
+
+            if (so->nonbinning->inputs[n].sysval)
+               continue;
+
+            /* be sure to keep inputs, even if only used in VS */
+            if (so->nonbinning->inputs[n].compmask & (1 << c))
+               array_insert(in->block, in->block->keeps, in);
+         }
+      }
+
+      struct ir3_instruction *end = ir3_instr_create(ctx->block, OPC_END,
+            outputs_count + 1);
+
+      __ssa_dst(end);
+      for (unsigned i = 0; i < outputs_count; i++) {
+         __ssa_src(end, outputs[i], 0);
+      }
+
+      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
+      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
+
+      array_insert(ctx->block, ctx->block->keeps, end);
+
+      /* at this point, for binning pass, throw away unneeded outputs: */
+      if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+         fixup_binning_pass(ctx, end);
+   }
 
-   /* at this point, for binning pass, throw away unneeded outputs: */
-   if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-      fixup_binning_pass(ctx);
    ir3_debug_print(ir, "AFTER: nir->ir3");
    ir3_validate(ir);
@@ -3794,7 +3825,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    * we can re-use same VS_CONST state group.
    */
   if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
-      fixup_binning_pass(ctx);
+      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {}
   }
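Both branches above build the terminator the same way: create the cat-0 instruction with one register slot per output plus the dummy dst, attach the collects as SSA sources, stash the outidxs array on the instruction, and add it to the block's keeps. A hypothetical helper capturing that shared pattern (emit_shader_terminator is an illustrative name, not something this patch adds):

   static struct ir3_instruction *
   emit_shader_terminator(struct ir3_block *block, opc_t opc,
                          struct ir3_instruction **outputs,
                          const unsigned *outidxs, unsigned outputs_count)
   {
      struct ir3_instruction *term =
         ir3_instr_create(block, opc, outputs_count + 1);

      __ssa_dst(term);
      for (unsigned i = 0; i < outputs_count; i++)
         __ssa_src(term, outputs[i], 0);

      term->end.outidxs = ralloc_array(term, unsigned, outputs_count);
      memcpy(term->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(block, block->keeps, term);
      return term;
   }

The chmask path would additionally set barrier_class/barrier_conflict and emit the trailing chsh, exactly as in the hunk above.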
@@ -3915,12 +3946,14 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
    for (unsigned i = 0; i < so->outputs_count; i++)
       so->outputs[i].regid = INVALID_REG;
 
-   foreach_output (out, ir) {
-      assert(out->opc == OPC_META_COLLECT);
-      unsigned outidx = out->collect.outidx;
+   struct ir3_instruction *end = find_end(so->ir);
 
-      so->outputs[outidx].regid = out->regs[0]->num;
-      so->outputs[outidx].half = !!(out->regs[0]->flags & IR3_REG_HALF);
+   for (unsigned i = 1; i < end->regs_count; i++) {
+      unsigned outidx = end->end.outidxs[i - 1];
+      struct ir3_register *reg = end->regs[i];
+
+      so->outputs[outidx].regid = reg->num;
+      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
    }
 
    foreach_input (in, ir) {
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
index 20453da5ef7..7e65228cb97 100644
--- a/src/freedreno/ir3/ir3_cp.c
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -350,8 +350,10 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
          return true;
       }
    } else if ((is_same_type_mov(src) || is_const_mov(src)) &&
-         /* cannot collapse const/immed/etc into meta instrs: */
-         !is_meta(instr)) {
+         /* cannot collapse const/immed/etc into meta instrs and control
+          * flow:
+          */
+         !is_meta(instr) && opc_cat(instr->opc) != 0) {
       /* immed/const/etc cases, which require some special handling: */
       struct ir3_register *src_reg = src->regs[1];
       unsigned new_flags = reg->flags;
@@ -643,11 +645,6 @@ ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
 
    ir3_clear_mark(ir);
 
-   foreach_output_n (out, n, ir) {
-      instr_cp(&ctx, out);
-      ir->outputs[n] = eliminate_output_mov(&ctx, out);
-   }
-
    foreach_block (block, &ir->block_list) {
       if (block->condition) {
          instr_cp(&ctx, block->condition);
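The guard added to reg_cp() pairs with the cat-0 case added to ir3_valid_flags() at the top of this patch: now that end/chmask consume real sources, copy propagation must not fold immediates, const-file reads, or relative-addressed values into them; the feeding mov has to stay. Stated as a predicate, illustrative only (the real checks are the two in the diff):

   static bool
   can_cp_into_terminator(struct ir3_instruction *instr, unsigned new_flags)
   {
      /* cat 0 (end, chmask): plain GPR sources only */
      if (opc_cat(instr->opc) == 0)
         return new_flags == 0;

      return true;   /* other categories: see ir3_valid_flags() */
   }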
diff --git a/src/freedreno/ir3/ir3_dce.c b/src/freedreno/ir3/ir3_dce.c
index e1e303f5482..a87704700c9 100644
--- a/src/freedreno/ir3/ir3_dce.c
+++ b/src/freedreno/ir3/ir3_dce.c
@@ -137,9 +137,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
    foreach_array (arr, &ir->array_list)
       arr->unused = true;
 
-   foreach_output (out, ir)
-      instr_dce(out, false);
-
    foreach_block (block, &ir->block_list) {
       for (i = 0; i < block->keeps_count; i++)
         instr_dce(block->keeps[i], false);
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index c85b9c8a383..c0615ffdc06 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -80,6 +80,10 @@ ir3_delayslots(struct ir3_instruction *assigner,
    if (assigner->opc == OPC_MOVMSK)
       return 4;
 
+   /* As far as we know, shader outputs don't need any delay. */
+   if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
+      return 0;
+
    /* assigner must be alu: */
    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
         is_mem(consumer)) {
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
index 61ecd5320b3..f0bb646b08a 100644
--- a/src/freedreno/ir3/ir3_group.c
+++ b/src/freedreno/ir3/ir3_group.c
@@ -165,9 +165,6 @@ find_neighbors(struct ir3 *ir)
    bool progress = false;
    unsigned i;
 
-   foreach_output (out, ir)
-      progress |= instr_find_neighbors(out);
-
    foreach_block (block, &ir->block_list) {
       for (i = 0; i < block->keeps_count; i++) {
         struct ir3_instruction *instr = block->keeps[i];
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 1f60b4dfaf4..4eb5362960d 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -240,7 +240,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
       printf(" ");
    }
 
-   if (!is_flow(instr)) {
+   if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
       for (unsigned i = 0, n = 0; i < instr->regs_count; i++) {
         struct ir3_register *reg = instr->regs[i];
 
@@ -397,9 +397,4 @@ ir3_print(struct ir3 *ir)
 {
    foreach_block (block, &ir->block_list)
       print_block(block, 0);
-
-   foreach_output_n (out, i, ir) {
-      printf("out%d: ", i);
-      print_instr(out, 0);
-   }
 }
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 28994620304..7ef7dbf37d9 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -1233,9 +1233,6 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
       foreach_input (in, ctx->ir) {
         reg_assign(ctx, in->regs[0], in);
       }
-      foreach_output (out, ctx->ir) {
-         reg_assign(ctx, out->regs[0], out);
-      }
    }
 }
 
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 55f4c730cda..431d7e98109 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -971,16 +971,15 @@ mark_kill_path(struct ir3_instruction *instr)
 static bool
 is_output_collect(struct ir3_instruction *instr)
 {
-   struct ir3 *ir = instr->block->shader;
+   if (instr->opc != OPC_META_COLLECT)
+      return false;
 
-   for (unsigned i = 0; i < ir->outputs_count; i++) {
-      struct ir3_instruction *collect = ir->outputs[i];
-      assert(collect->opc == OPC_META_COLLECT);
-      if (instr == collect)
-         return true;
+   foreach_ssa_use (use, instr) {
+      if (use->opc != OPC_END && use->opc != OPC_CHMASK)
+         return false;
    }
 
-   return false;
+   return true;
 }
 
 /* Is it's only use as output? */
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index 90822b0ed50..a4bfd88dced 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -661,17 +661,6 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
            fetch->wrmask, fetch->cmd);
    }
 
-   foreach_output_n (instr, i, ir) {
-      reg = instr->regs[0];
-      regid = reg->num;
-      fprintf(out, "@out(%sr%d.%c)\tout%d",
-            (reg->flags & IR3_REG_HALF) ? "h" : "",
-            (regid >> 2), "xyzw"[regid & 0x3], i);
-      if (reg->wrmask > 0x1)
-         fprintf(out, " (wrmask=0x%x)", reg->wrmask);
-      fprintf(out, "\n");
-   }
-
    const struct ir3_const_state *const_state = ir3_const_state(so);
    for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
       fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
"h" : "", - (regid >> 2), "xyzw"[regid & 0x3], i); - if (reg->wrmask > 0x1) - fprintf(out, " (wrmask=0x%x)", reg->wrmask); - fprintf(out, "\n"); - } - const struct ir3_const_state *const_state = ir3_const_state(so); for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) { fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i); diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index f6cc9ba159d..85038eedda1 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -108,6 +108,8 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) } } else if (opc_cat(instr->opc) == 6) { /* handled below */ + } else if (opc_cat(instr->opc) == 0) { + /* end/chmask/etc are allowed to have different size sources */ } else if (n > 0) { validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF)); }