From 2ed761021fec7c2bfd705c8b3293e70fe467f8ef Mon Sep 17 00:00:00 2001
From: Patrick Lerda <patrick9876@free.fr>
Date: Mon, 8 Dec 2025 15:21:06 +0100
Subject: [PATCH] r600: make vertex r10g10b10a2_sscaled conformant on palm and
 beyond

This is a gl4.3 issue very similar to e8fa3b49503a.

When r10g10b10a2_sscaled is fetched as a vertex format on palm, the
hardware result doesn't follow the current standard: the 2-bit .w
component is returned as an unsigned value instead of being
sign-extended. The table below describes the situation.

This change fixes this issue by adding two gpu instructions at
the vertex fetch shader stage. An equivalent C representation and
a gpu asm dump of the generated sequence are available below.

.w(2-bits)	expected	palm		cypress
0		 0		0		 0
1		 1		1		 1
2		-2		2		-2
3		-1		3		-1

w_out = w_in - (w_in > 1. ? 4. : 0.);

0002 00000024 A0040000  ALU 2 @72
 0072 801F2C0A 600004C0     1 w:     SETGT*4                __.w,  R10.w, 1.0
 0074 839FCC0A 61400010     2 w:     ADD                    R10.w,  R10.w, -PV.w

Note: cypress returns the expected value, and does not need
this correction.

This change was tested on palm, barts and cayman. The following tests are fixed:
khr-gl4[3-6]/vertex_attrib_binding/basic-input-case6: fail pass
khr-gles31/core/vertex_attrib_binding/basic-input-case6: fail pass

Cc: mesa-stable
Signed-off-by: Patrick Lerda <patrick9876@free.fr>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38849>
---
 src/gallium/drivers/r600/r600_shader.c | 64 ++++++++++++++++++--------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 0e19e6a0597..a64ed6124f0 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -338,6 +338,12 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
 	int i, j, r, fs_size;
 	uint32_t buffer_mask = 0;
 	struct r600_fetch_shader *shader;
+	unsigned post_fix_count = 0;
+	struct post_fix {
+		uint16_t gpr;
+		uint16_t swizzle3;
+		uint16_t num_format;
+	} post_fix[VERT_ATTRIB_MAX];
 
 	assert(count < 32);
 
@@ -434,33 +440,50 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
 
 		if (unlikely(rctx->b.family >= CHIP_PALM &&
 			     format == FMT_2_10_10_10 &&
-			     !num_format && format_comp &&
+			     (!num_format || num_format == 2) &&
+			     format_comp &&
 			     desc->swizzle[3] >= PIPE_SWIZZLE_X &&
 			     desc->swizzle[3] <= PIPE_SWIZZLE_W)) {
+			post_fix[post_fix_count].gpr = i + 1;
+			post_fix[post_fix_count].swizzle3 = desc->swizzle[3];
+			post_fix[post_fix_count].num_format = num_format;
+			post_fix_count++;
+		}
+	}
+
+	if (unlikely(post_fix_count)) {
+		bc.force_add_cf = 1;
+
+		for (i = 0; i < post_fix_count; i++) {
 			struct r600_bytecode_alu alu;
-			const unsigned sel_main = i + 1;
+			const uint16_t sel_main = post_fix[i].gpr;
+			const uint16_t swizzle3 = post_fix[i].swizzle3;
+			const uint16_t local_num_format = post_fix[i].num_format;
 
-			bc.force_add_cf = 1;
+			if (!local_num_format) {
+				memset(&alu, 0, sizeof(alu));
+				alu.op = ALU_OP1_MOV;
+				alu.src[0].sel = sel_main;
+				alu.src[0].chan = swizzle3;
+				alu.dst.chan = 1;
 
-			memset(&alu, 0, sizeof(alu));
-			alu.op = ALU_OP1_MOV;
-			alu.src[0].sel = sel_main;
-			alu.src[0].chan = desc->swizzle[3];
-			alu.dst.chan = 1;
-			alu.omod = 2;
-			alu.dst.clamp = 1;
+				alu.omod = 2;
+				alu.dst.clamp = 1;
 
-			if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
-				goto fail;
+				if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
+					goto fail;
+			}
 
 			memset(&alu, 0, sizeof(alu));
 			alu.op = ALU_OP2_SETGT;
 			alu.src[0].sel = sel_main;
-			alu.src[0].chan = desc->swizzle[3];
+			alu.src[0].chan = swizzle3;
 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
-			alu.src[1].value = 0x3f000000;
+			alu.src[1].value = !local_num_format ?
+				0x3f000000 :
+				0x3f800000;
 			alu.dst.chan = 3;
-			alu.omod = 1;
+			alu.omod = !local_num_format ? 1 : 2;
 			alu.last = 1;
 
 			if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
@@ -468,13 +491,18 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
 
 			memset(&alu, 0, sizeof(alu));
 			alu.op = ALU_OP2_ADD;
-			alu.src[0].sel = V_SQ_ALU_SRC_PV;
-			alu.src[0].chan = 1;
+			if (!local_num_format) {
+				alu.src[0].sel = V_SQ_ALU_SRC_PV;
+				alu.src[0].chan = 1;
+			} else {
+				alu.src[0].sel = sel_main;
+				alu.src[0].chan = swizzle3;
+			}
 			alu.src[1].sel = V_SQ_ALU_SRC_PV;
 			alu.src[1].chan = 3;
 			alu.src[1].neg = 1;
 			alu.dst.sel = sel_main;
-			alu.dst.chan = desc->swizzle[3];
+			alu.dst.chan = swizzle3;
 			alu.dst.write = 1;
 			alu.last = 1;