r600: make vertex r10g10b10a2_sscaled conformant on palm and beyond

This is a gl4.3 issue very similar to e8fa3b4950.

When r10g10b10a2_sscaled is used as a vertex format, palm hardware
does not follow the current standard: the 2-bit .w component is
returned as an unsigned value (0..3) instead of a signed one (-2..1).
The table below describes the situation.

This change fixes this issue by adding two gpu instructions at
the vertex fetch shader stage. An equivalent C representation and
a gpu asm dump of the generated sequence are available below.

.w(2-bits)	expected	palm		cypress
0		 0		0		 0
1		 1		1		 1
2		-2		2		-2
3		-1		3		-1

w_out = w_in - (w_in > 1. ? 4. : 0.);

0002 00000024 A0040000  ALU 2 @72
 0072 801F2C0A 600004C0     1 w:     SETGT*4                __.w,  R10.w, 1.0
 0074 839FCC0A 61400010     2 w:     ADD                    R10.w,  R10.w, -PV.w

Note: cypress returns the expected value, and does not need
this correction.

This change was tested on palm, barts and cayman. The following tests are fixed:
khr-gl4[3-6]/vertex_attrib_binding/basic-input-case6: fail -> pass
khr-gles31/core/vertex_attrib_binding/basic-input-case6: fail -> pass

Cc: mesa-stable
Signed-off-by: Patrick Lerda <patrick9876@free.fr>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38849>
This commit is contained in:
Patrick Lerda 2025-12-08 15:21:06 +01:00 committed by Marge Bot
parent da1108dcc4
commit 2ed761021f

View file

@ -338,6 +338,12 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
int i, j, r, fs_size;
uint32_t buffer_mask = 0;
struct r600_fetch_shader *shader;
unsigned post_fix_count = 0;
struct post_fix {
uint16_t gpr;
uint16_t swizzle3;
uint16_t num_format;
} post_fix[VERT_ATTRIB_MAX];
assert(count < 32);
@ -434,33 +440,50 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
if (unlikely(rctx->b.family >= CHIP_PALM &&
format == FMT_2_10_10_10 &&
!num_format && format_comp &&
(!num_format || num_format == 2) &&
format_comp &&
desc->swizzle[3] >= PIPE_SWIZZLE_X &&
desc->swizzle[3] <= PIPE_SWIZZLE_W)) {
post_fix[post_fix_count].gpr = i + 1;
post_fix[post_fix_count].swizzle3 = desc->swizzle[3];
post_fix[post_fix_count].num_format = num_format;
post_fix_count++;
}
}
if (unlikely(post_fix_count)) {
bc.force_add_cf = 1;
for (i = 0; i < post_fix_count; i++) {
struct r600_bytecode_alu alu;
const unsigned sel_main = i + 1;
const uint16_t sel_main = post_fix[i].gpr;
const uint16_t swizzle3 = post_fix[i].swizzle3;
const uint16_t local_num_format = post_fix[i].num_format;
bc.force_add_cf = 1;
if (!local_num_format) {
memset(&alu, 0, sizeof(alu));
alu.op = ALU_OP1_MOV;
alu.src[0].sel = sel_main;
alu.src[0].chan = swizzle3;
alu.dst.chan = 1;
memset(&alu, 0, sizeof(alu));
alu.op = ALU_OP1_MOV;
alu.src[0].sel = sel_main;
alu.src[0].chan = desc->swizzle[3];
alu.dst.chan = 1;
alu.omod = 2;
alu.dst.clamp = 1;
alu.omod = 2;
alu.dst.clamp = 1;
if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
goto fail;
if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
goto fail;
}
memset(&alu, 0, sizeof(alu));
alu.op = ALU_OP2_SETGT;
alu.src[0].sel = sel_main;
alu.src[0].chan = desc->swizzle[3];
alu.src[0].chan = swizzle3;
alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
alu.src[1].value = 0x3f000000;
alu.src[1].value = !local_num_format ?
0x3f000000 :
0x3f800000;
alu.dst.chan = 3;
alu.omod = 1;
alu.omod = !local_num_format ? 1 : 2;
alu.last = 1;
if (unlikely(r = r600_bytecode_add_alu(&bc, &alu)))
@ -468,13 +491,18 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
memset(&alu, 0, sizeof(alu));
alu.op = ALU_OP2_ADD;
alu.src[0].sel = V_SQ_ALU_SRC_PV;
alu.src[0].chan = 1;
if (!local_num_format) {
alu.src[0].sel = V_SQ_ALU_SRC_PV;
alu.src[0].chan = 1;
} else {
alu.src[0].sel = sel_main;
alu.src[0].chan = swizzle3;
}
alu.src[1].sel = V_SQ_ALU_SRC_PV;
alu.src[1].chan = 3;
alu.src[1].neg = 1;
alu.dst.sel = sel_main;
alu.dst.chan = desc->swizzle[3];
alu.dst.chan = swizzle3;
alu.dst.write = 1;
alu.last = 1;