From 0508db91556242c57029ad538613c2b1ee1969ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= <pavel.ondracka@gmail.com>
Date: Wed, 9 Aug 2023 10:17:48 +0200
Subject: [PATCH] r300: implement bias presubtract

RV530 shader-db:
total instructions in shared programs: 129468 -> 128859 (-0.47%)
instructions in affected programs: 34432 -> 33823 (-1.77%)
helped: 362
HURT: 56
total presub in shared programs: 5411 -> 7635 (41.10%)
presub in affected programs: 2069 -> 4293 (107.49%)
helped: 8
HURT: 468
total temps in shared programs: 16918 -> 16944 (0.15%)
temps in affected programs: 2022 -> 2048 (1.29%)
helped: 73
HURT: 79
total lits in shared programs: 3555 -> 2913 (-18.06%)
lits in affected programs: 2346 -> 1704 (-27.37%)
helped: 479
HURT: 0
total cycles in shared programs: 194675 -> 194124 (-0.28%)
cycles in affected programs: 62939 -> 62388 (-0.88%)
helped: 343
HURT: 84

Also dEQP-GLES2.functional.shaders.random.trigonometric.fragment.15
now fits into the instruction limit on RV370.

Reviewed-by: Filip Gawin <filip.gawin@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24830>
---
 .../drivers/r300/ci/r300-rv370-fails.txt      |  1 -
 .../r300/ci/r300-rv370-swtcl-fails.txt        |  1 -
 .../r300/compiler/r300_nir_algebraic.py       |  6 ++
 .../drivers/r300/compiler/radeon_optimize.c   | 87 ++++++++++++++++++-
 4 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r300/ci/r300-rv370-fails.txt b/src/gallium/drivers/r300/ci/r300-rv370-fails.txt
index 4097cfeb002..0c43c21baff 100644
--- a/src/gallium/drivers/r300/ci/r300-rv370-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv370-fails.txt
@@ -53,7 +53,6 @@ dEQP-GLES2.functional.shaders.random.all_features.fragment.5,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.6,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.93,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.97,Fail
-dEQP-GLES2.functional.shaders.random.trigonometric.fragment.15,Fail
 dEQP-GLES2.functional.shaders.random.trigonometric.fragment.45,Fail
 dEQP-GLES2.functional.texture.filtering.cube.linear_linear_clamp_l8_npot,Fail
 dEQP-GLES2.functional.texture.filtering.cube.linear_linear_clamp_rgb888_npot,Fail
diff --git a/src/gallium/drivers/r300/ci/r300-rv370-swtcl-fails.txt b/src/gallium/drivers/r300/ci/r300-rv370-swtcl-fails.txt
index 3ce139c3732..b5060ffc0e0 100644
--- a/src/gallium/drivers/r300/ci/r300-rv370-swtcl-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv370-swtcl-fails.txt
@@ -47,7 +47,6 @@ dEQP-GLES2.functional.shaders.random.all_features.fragment.5,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.6,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.93,Fail
 dEQP-GLES2.functional.shaders.random.all_features.fragment.97,Fail
-dEQP-GLES2.functional.shaders.random.trigonometric.fragment.15,Fail
 dEQP-GLES2.functional.shaders.random.trigonometric.fragment.45,Fail
 dEQP-GLES2.functional.texture.filtering.cube.linear_linear_clamp_l8_npot,Fail
 dEQP-GLES2.functional.texture.filtering.cube.linear_linear_clamp_rgb888_npot,Fail
diff --git a/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py b/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
index 7967f14f4f2..c27dba90ec4 100644
--- a/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
+++ b/src/gallium/drivers/r300/compiler/r300_nir_algebraic.py
@@ -70,6 +70,12 @@ r300_nir_prepare_presubtract = [
         (('fadd', ('fneg', a), 1.0), ('fadd', 1.0, ('fneg', a))),
         (('fadd', a, -1.0), ('fneg', ('fadd', 1.0, ('fneg', a)))),
         (('fadd', -1.0, a), ('fneg', ('fadd', 1.0, ('fneg', a)))),
+        # Bias presubtract 1 - 2 * x expects MAD -a 2.0 1.0 form.
+        (('ffma', 2.0, ('fneg', a), 1.0), ('ffma', ('fneg', a), 2.0, 1.0)),
+        (('ffma', a, -2.0, 1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
+        (('ffma', -2.0, a, 1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
+        (('ffma', 2.0, a, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
+        (('ffma', a, 2.0, -1.0), ('fneg', ('ffma', ('fneg', a), 2.0, 1.0))),
 ]
 
 # Previous prepare_presubtract pass can sometimes produce double fneg patterns.
diff --git a/src/gallium/drivers/r300/compiler/radeon_optimize.c b/src/gallium/drivers/r300/compiler/radeon_optimize.c
index 9ecb9eccab9..662346853e6 100644
--- a/src/gallium/drivers/r300/compiler/radeon_optimize.c
+++ b/src/gallium/drivers/r300/compiler/radeon_optimize.c
@@ -481,7 +481,7 @@ static int is_presub_candidate(
 	unsigned int i;
 	unsigned int is_constant[2] = {0, 0};
 
-	assert(inst->U.I.Opcode == RC_OPCODE_ADD);
+	assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
 
 	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
 			|| inst->U.I.SaturateMode
@@ -490,7 +490,7 @@ static int is_presub_candidate(
 		return 0;
 	}
 
-	/* If both sources use a constant swizzle, then we can't convert it to
+	/* If the first two sources use a constant swizzle, then we can't convert it to
 	 * a presubtract operation.  In fact for the ADD and SUB presubtract
 	 * operations neither source can contain a constant swizzle.  This
 	 * specific case is checked in peephole_add_presub_add() when
@@ -573,6 +573,23 @@ static void presub_replace_inv(
 	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
 }
 
+static void presub_replace_bias(
+	struct rc_instruction * inst_mad,
+	struct rc_instruction * inst_reader,
+	unsigned int src_index)
+{
+	/* Only inst_reader is written: inst_mad may stay in the program,
+	 * so its first operand is copied rather than edited in place. */
+	struct rc_src_register * presub_src = &inst_reader->U.I.PreSub.SrcReg[0];
+	struct rc_src_register * reader_src = &inst_reader->U.I.SrcReg[src_index];
+	*presub_src = inst_mad->U.I.SrcReg[0];
+	presub_src->Negate = 0;
+	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
+	*reader_src = chain_srcregs(*reader_src, *presub_src);
+	reader_src->File = RC_FILE_PRESUB;
+	reader_src->Index = RC_PRESUB_BIAS;
+}
+
 /**
  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
@@ -622,6 +639,66 @@ static int peephole_add_presub_inv(
 	return 0;
 }
 
+/**
+ * PRESUB_BIAS: MAD TEMP[0], -TEMP[1], 2.0, none.1
+ * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The third source
+ * of the MAD instruction must have the constant 1 swizzle and the second one
+ * must be an immediate constant equal to 2.0, so this function should
+ * be called after the constant_folding optimization.
+ * @return
+ * 	0 if the MAD instruction is still part of the program.
+ * 	1 if the MAD instruction is no longer part of the program.
+ */
+static int peephole_mad_presub_bias(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst_mad)
+{
+	unsigned int i, swz;
+
+	if (!is_presub_candidate(c, inst_mad))
+		return 0;
+
+	/* Check if src2 is 1. */
+	for(i = 0; i < 4; i++ ) {
+		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
+			continue;
+
+		swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
+		if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
+			return 0;
+	}
+
+	/* Check if src1 is 2.  Only an RC_FILE_CONSTANT source indexes the constant table. */
+	struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
+	if (src1_reg.File != RC_FILE_CONSTANT || src1_reg.Abs
+		|| (src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0)
+		return 0;
+	struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
+	if (constant->Type != RC_CONSTANT_IMMEDIATE)
+		return 0;
+	for (i = 0; i < 4; i++) {
+		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
+			continue;
+		swz = GET_SWZ(src1_reg.Swizzle, i);
+		if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
+			return 0;
+	}
+
+	/* Check src0: negated on every written channel, no abs, no constant swizzle. */
+	if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
+						inst_mad->U.I.DstReg.WriteMask
+		|| inst_mad->U.I.SrcReg[0].Abs
+		|| src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
+		return 0;
+	}
+
+	if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
+		rc_remove_instruction(inst_mad);
+		return 1;
+	}
+	return 0;
+}
+
 struct peephole_mul_cb_data {
 	struct rc_dst_register * Writer;
 	unsigned int Clobbered;
@@ -821,6 +898,12 @@ static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
 			return 1;
 		break;
 	}
+	case RC_OPCODE_MAD:
+	{
+		if (peephole_mad_presub_bias(c, inst))
+			return 1;
+		break;
+	}
 	default:
 		break;
 	}