From 5200e1c2120f3143dcd11eb0bc0b4ddf8ee62f86 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 7 Feb 2022 23:40:25 -0500
Subject: [PATCH] translate: improve sse2 32-bit unsigned -> float conversion

The existing logic would drop the low bit. Instead, let's drop the high
bit, do the conversion, and then add the fixed constant back in if the
value had the high bit set originally.

Fixes KHR-GL45.direct_state_access.vertex_arrays_attribute_format on
drivers that use this module to handle the format conversion.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Emma Anholt <emma@anholt.net>
Tested-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14922>
---
 .../auxiliary/translate/translate_sse.c       | 55 ++++++++++++++-----
 .../drivers/zink/ci/zink-lvp-fails.txt        |  1 -
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index c128ac3da7e..91f0ea6e4cd 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -64,7 +64,8 @@ struct translate_buffer_variant
 
 #define ELEMENT_BUFFER_INSTANCE_ID  1001
 
-#define NUM_CONSTS 7
+#define NUM_FLOAT_CONSTS 9
+#define NUM_UNSIGNED_CONSTS 1
 
 enum
 {
@@ -74,22 +75,32 @@ enum
    CONST_INV_32767,
    CONST_INV_65535,
    CONST_INV_2147483647,
-   CONST_255
+   CONST_INV_4294967295,
+   CONST_255,
+   CONST_2147483648,
+   /* float consts end */
+   CONST_2147483647_INT,
 };
 
 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
-static float consts[NUM_CONSTS][4] = {
+static float consts[NUM_FLOAT_CONSTS][4] = {
    {0, 0, 0, 1},
    C(1.0 / 127.0),
    C(1.0 / 255.0),
    C(1.0 / 32767.0),
    C(1.0 / 65535.0),
    C(1.0 / 2147483647.0),
-   C(255.0)
+   C(1.0 / 4294967295.0),
+   C(255.0),
+   C(2147483648.0),
 };
 
 #undef C
 
+static unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
+   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
+};
+
 struct translate_sse
 {
    struct translate translate;
@@ -100,9 +111,10 @@ struct translate_sse
    struct x86_function elt8_func;
    struct x86_function *func;
 
-     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float consts[NUM_FLOAT_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float uconsts[NUM_UNSIGNED_CONSTS][4];
    int8_t reg_to_const[16];
-   int8_t const_to_reg[NUM_CONSTS];
+   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];
 
    struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
    unsigned nr_buffers;
@@ -165,9 +177,13 @@ get_const(struct translate_sse *p, unsigned id)
    p->const_to_reg[id] = i;
 
    /* TODO: this should happen outside the loop, if possible */
+   const void *c;
+   if (id < NUM_FLOAT_CONSTS)
+      c = &p->consts[id][0];
+   else
+      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
    sse_movaps(p->func, reg,
-              x86_make_disp(p->machine_EDI,
-                            get_offset(p, &p->consts[id][0])));
+              x86_make_disp(p->machine_EDI, get_offset(p, c)));
 
    return reg;
 }
@@ -508,6 +524,7 @@ translate_attr_convert(struct translate_sse *p,
         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg auxXMM;
 
       for (i = 0; i < output_desc->nr_channels; ++i) {
          if (swizzle[i] == PIPE_SWIZZLE_0
@@ -544,12 +561,26 @@ translate_attr_convert(struct translate_sse *p,
                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
                break;
             case 32:           /* we lose precision here */
-               sse2_psrld_imm(p->func, dataXMM, 1);
+               /* No unsigned conversion (except in AVX512F), so we check if
+                * it's negative, and stick the high bit as a separate float
+                * value in an aux register: */
+               auxXMM = x86_make_reg(file_XMM, 1);
+               /* aux = 0 */
+               sse_xorps(p->func, auxXMM, auxXMM);
+               /* aux = aux > data ? 0xffffffff : 0 */
+               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
+               /* data = data & 0x7fffffff */
+               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
+               /* aux = aux & 2147483648.0 */
+               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
                break;
             default:
                return FALSE;
             }
             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if (input_desc->channel[0].size == 32)
+               /* add in the high bit's worth of float that we AND'd away */
+               sse_addps(p->func, dataXMM, auxXMM);
             if (input_desc->channel[0].normalized) {
                struct x86_reg factor;
                switch (input_desc->channel[0].size) {
@@ -560,7 +591,7 @@ translate_attr_convert(struct translate_sse *p,
                   factor = get_const(p, CONST_INV_65535);
                   break;
                case 32:
-                  factor = get_const(p, CONST_INV_2147483647);
+                  factor = get_const(p, CONST_INV_4294967295);
                   break;
                default:
                   assert(0);
@@ -572,9 +603,6 @@ translate_attr_convert(struct translate_sse *p,
                }
                sse_mulps(p->func, dataXMM, factor);
             }
-            else if (input_desc->channel[0].size == 32)
-               /* compensate for the bit we threw away to fit u32 into s32 */
-               sse_addps(p->func, dataXMM, dataXMM);
             break;
          case UTIL_FORMAT_TYPE_SIGNED:
             if (!(x86_target_caps(p->func) & X86_SSE2))
@@ -1491,6 +1519,7 @@ translate_sse2_create(const struct translate_key *key)
 
    memset(p, 0, sizeof(*p));
    memcpy(p->consts, consts, sizeof(consts));
+   memcpy(p->uconsts, uconsts, sizeof(uconsts));
 
    p->translate.key = *key;
    p->translate.release = translate_sse_release;
diff --git a/src/gallium/drivers/zink/ci/zink-lvp-fails.txt b/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
index 103d922a57b..1ab12943331 100644
--- a/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
+++ b/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
@@ -1,5 +1,4 @@
 KHR-GL46.compute_shader.conditional-dispatching,Fail
-KHR-GL46.direct_state_access.vertex_arrays_attribute_format,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec2,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec3,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec4,Fail