translate: improve sse2 32-bit unsigned -> float conversion

The existing logic would drop the low bit. Instead, let's drop the high bit, do the conversion, and then add the fixed constant back in if the value had the high bit set originally. Fixes KHR-GL45.direct_state_access.vertex_arrays_attribute_format on drivers that use this module to handle the format conversion. Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu> Acked-by: Emma Anholt <emma@anholt.net> Tested-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14922>
2026-01-03 20:10:17 +01:00 · 2022-02-07 23:40:25 -05:00 · 2022-02-07 23:40:25 -05:00 · 5200e1c212
commit 5200e1c212
parent 0b69f7b15d
2 changed files with 42 additions and 14 deletions
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@ -64,7 +64,8 @@ struct translate_buffer_variant

 #define ELEMENT_BUFFER_INSTANCE_ID  1001

-#define NUM_CONSTS 7
+#define NUM_FLOAT_CONSTS 9
+#define NUM_UNSIGNED_CONSTS 1

 enum
 {
@ -74,22 +75,32 @@ enum
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
-   CONST_255
+   CONST_INV_4294967295,
+   CONST_255,
+   CONST_2147483648,
+   /* float consts end */
+   CONST_2147483647_INT,
 };

 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
-static float consts[NUM_CONSTS][4] = {
+static float consts[NUM_FLOAT_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
-   C(255.0)
+   C(1.0 / 4294967295.0),
+   C(255.0),
+   C(2147483648.0),
 };

 #undef C

+static unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
+   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
+};
+
 struct translate_sse
 {
   struct translate translate;
@ -100,9 +111,10 @@ struct translate_sse
   struct x86_function elt8_func;
   struct x86_function *func;

-     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float consts[NUM_FLOAT_CONSTS][4];
+   PIPE_ALIGN_VAR(16) float uconsts[NUM_UNSIGNED_CONSTS][4];
   int8_t reg_to_const[16];
-   int8_t const_to_reg[NUM_CONSTS];
+   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;
@ -165,9 +177,13 @@ get_const(struct translate_sse *p, unsigned id)
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
+   const void *c;
+   if (id < NUM_FLOAT_CONSTS)
+      c = &p->consts[id][0];
+   else
+      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
   sse_movaps(p->func, reg,
-              x86_make_disp(p->machine_EDI,
-                            get_offset(p, &p->consts[id][0])));
+              x86_make_disp(p->machine_EDI, get_offset(p, c)));

   return reg;
 }
@ -508,6 +524,7 @@ translate_attr_convert(struct translate_sse *p,
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg auxXMM;

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
@ -544,12 +561,26 @@ translate_attr_convert(struct translate_sse *p,
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
-               sse2_psrld_imm(p->func, dataXMM, 1);
+               /* No unsigned conversion (except in AVX512F), so we check if
+                * it's negative, and stick the high bit as a separate float
+                * value in an aux register: */
+               auxXMM = x86_make_reg(file_XMM, 1);
+               /* aux = 0 */
+               sse_xorps(p->func, auxXMM, auxXMM);
+               /* aux = aux > data ? 0xffffffff : 0 */
+               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
+               /* data = data & 0x7fffffff */
+               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
+               /* aux = aux & 2147483648.0 */
+               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if (input_desc->channel[0].size == 32)
+               /* add in the high bit's worth of float that we AND'd away */
+               sse_addps(p->func, dataXMM, auxXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
@ -560,7 +591,7 @@ translate_attr_convert(struct translate_sse *p,
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
-                  factor = get_const(p, CONST_INV_2147483647);
+                  factor = get_const(p, CONST_INV_4294967295);
                  break;
               default:
                  assert(0);
@ -572,9 +603,6 @@ translate_attr_convert(struct translate_sse *p,
               }
               sse_mulps(p->func, dataXMM, factor);
            }
-            else if (input_desc->channel[0].size == 32)
-               /* compensate for the bit we threw away to fit u32 into s32 */
-               sse_addps(p->func, dataXMM, dataXMM);
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
@ -1491,6 +1519,7 @@ translate_sse2_create(const struct translate_key *key)

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));
+   memcpy(p->uconsts, uconsts, sizeof(uconsts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
--- a/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
+++ b/src/gallium/drivers/zink/ci/zink-lvp-fails.txt
@ -1,5 +1,4 @@
 KHR-GL46.compute_shader.conditional-dispatching,Fail
-KHR-GL46.direct_state_access.vertex_arrays_attribute_format,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec2,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec3,Fail
 KHR-GL46.gpu_shader_fp64.builtin.mod_dvec4,Fail