From 9db5da0f388f22a3d705d02e34fe1026c9f176fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= <pavel.ondracka@gmail.com>
Date: Thu, 15 Jun 2023 15:10:01 +0200
Subject: [PATCH] r300: add partial CMP support on R5xx

VE_COND_MUX_GTE is a nice match for the TGSI CMP opcode, however
there is a big limitation due to the general shortcoming of the
vertex shader engine that any instruction can read only two different
temporary registers. So we still have to lower in some cases.

Shader-db RV530:
total instructions in shared programs: 130872 -> 130333 (-0.41%)
instructions in affected programs: 29854 -> 29315 (-1.81%)
helped: 294
HURT: 83
total temps in shared programs: 16747 -> 16775 (0.17%)
temps in affected programs: 407 -> 435 (6.88%)
helped: 10
HURT: 38

Reviewed-by: Filip Gawin <filip.gawin@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23691>
---
 .../drivers/r300/compiler/r3xx_vertprog.c     | 31 +++++++++++++++++++
 .../r300/compiler/radeon_program_alu.c        | 13 +++++++-
 src/gallium/drivers/r300/r300_screen.c        |  1 +
 src/gallium/drivers/r300/r300_state.c         |  1 -
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
index 5ad1f4eb857..4ee3b878b4c 100644
--- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
+++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
@@ -237,6 +237,36 @@ static void ei_math1(struct r300_vertex_program_code *vp,
 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
+static void ei_cmp(struct r300_vertex_program_code *vp,
+				struct rc_sub_instruction *vpi,
+				unsigned int * inst)
+{
+	inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
+				     0,
+				     0,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File),
+                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
+
+	/* Arguments with constant swizzles still count as a unique
+	 * temporary, so we should make sure these arguments share a
+	 * register index with one of the other arguments. */
+	for (unsigned i = 0; i < 3; i++) {
+		unsigned j = (i + 1) % 3;
+		if (vpi->SrcReg[i].File == RC_FILE_NONE &&
+			(vpi->SrcReg[j].File == RC_FILE_NONE ||
+			 vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
+			vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
+			break;
+		}
+	}
+
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[2]);
+	inst[3] = t_src(vp, &vpi->SrcReg[1]);
+}
+
 static void ei_lit(struct r300_vertex_program_code *vp,
 				      struct rc_sub_instruction *vpi,
 				      unsigned int * inst)
@@ -414,6 +444,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
+		case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
diff --git a/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
index 314fa7655ee..2e2d75143e1 100644
--- a/src/gallium/drivers/r300/compiler/radeon_program_alu.c
+++ b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
@@ -395,7 +395,18 @@ int radeonTransformALU(
 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	/* There is no decent CMP available, so let's rig one up.
+	/* R5xx has a CMP, but we can use it only if it reads from less than
+	 * three different temps. */
+	if (c->is_r500 &&
+	    (inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[1].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[2].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[1].Index ||
+	     inst->U.I.SrcReg[1].Index == inst->U.I.SrcReg[2].Index ||
+	     inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[2].Index))
+		return;
+
+	/* There is no decent CMP available on r300, so let's rig one up.
 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
 	 * The following sequence consumes zero to two temps and two extra slots
 	 * (the second temp and the second slot is consumed by transform_LRP),
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 1e920b0cc74..c7086853a8a 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -512,6 +512,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
 
 static const nir_shader_compiler_options r500_vs_compiler_options = {
    COMMON_NIR_OPTIONS,
+   .has_fused_comp_and_csel = true,
 
    /* Have HW loops support and 1024 max instr count, but don't unroll *too*
     * hard.
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 269281cd235..bd1202be031 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1952,7 +1952,6 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
            .ubo_vec4_max = 0x00ff,
        };
        static const struct nir_to_tgsi_options hwtcl_r500_options = {
-           .lower_cmp = true,
            .ubo_vec4_max = 0x00ff,
        };
        const struct nir_to_tgsi_options *ntt_options;