From dc398afb2782a594e58382dcb918e28825b7aec8 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 11 May 2026 14:35:36 +0200
Subject: [PATCH] nir: fix shuffling local IDs for quad derivatives with larger
 workgroup sizes

This was fundamentally broken for workgroup sizes >= 8x8.

This fixes new VKCTS coverage
dEQP-VK.glsl.texture_functions.texture.*_compute, and also few tests
from the vkd3d-proton testsuite (note that quad derivatives is
currently disabled for < GFX12).

Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41483>
---
 src/compiler/nir/nir_lower_system_values.c | 55 ++++++++++++----------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index 34061abe689..6dbc0acc732 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -579,48 +579,55 @@ lower_compute_system_value_instr(nir_builder *b,
           * That's the layout required by AMD hardware for derivatives to
           * work. Other hardware may work differently.
           *
-          * It's a classic tiling pattern that can be implemented by inserting
-          * bit y[0] between bits x[0] and x[1] like this:
+          * Map each thread to a 2x2 block by decomposing the linear index
+          * i = y * W + x into a block index and a within-block position:
           *
-          *    x[0],y[0],x[1],...x[last],y[1],...,y[last]
+          *    block     = i / 4
+          *    block_pos = i % 4     (0=(0,0) 1=(1,0) 2=(0,1) 3=(1,1))
           *
           * If the width is a power of two, use:
-          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x))
+          *    block_x = block & ((W/2) - 1)
+          *    block_y = block >> (log2(W) - 1)
           *
           * If the width is not a power of two or the local size is variable, use:
-          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x)
+          *    block_x = block % (W/2)
+          *    block_y = block / (W/2)
+          *
+          * In both cases:
+          *    x' = block_x * 2 + (block_pos & 1)
+          *    y' = block_y * 2 + (block_pos >> 1)
           *
           * GL_NV_compute_shader_derivatives requires that the width and height
           * are a multiple of two, which is also a requirement for the second
           * expression to work.
-          *
-          * The 2D result is: (x,y) = (i % w, i / w)
           */
 
-         nir_def *one = nir_imm_int(b, 1);
-         nir_def *inv_one = nir_imm_int(b, ~1);
-         nir_def *x_bit0 = nir_iand(b, x, one);
-         nir_def *y_bit0 = nir_iand(b, y, one);
-         nir_def *x_bits_1n = nir_iand(b, x, inv_one);
-         nir_def *y_bits_1n = nir_iand(b, y, inv_one);
-         nir_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one));
-         nir_def *bits_01x = nir_ior(b, bits_01,
-                                     nir_ishl(b, x_bits_1n, one));
          nir_def *i;
 
          if (!b->shader->info.workgroup_size_variable &&
              util_is_power_of_two_nonzero(size_x)) {
-            nir_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x));
-            i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x));
+            i = nir_ior(b, x, nir_ishl_imm(b, y, util_logbase2(size_x)));
          } else {
-            i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm));
+            i = nir_iadd(b, x, nir_imul(b, y, size_x_imm));
          }
 
-         /* This should be fast if size_x is an immediate or even a power
-          * of two.
-          */
-         x = nir_umod(b, i, size_x_imm);
-         y = nir_udiv(b, i, size_x_imm);
+         nir_def *block     = nir_ushr_imm(b, i, 2);
+         nir_def *block_pos = nir_iand_imm(b, i, 3);
+
+         nir_def *block_x, *block_y;
+         if (!b->shader->info.workgroup_size_variable &&
+             util_is_power_of_two_nonzero(size_x)) {
+            unsigned log2_half_size_x = util_logbase2(size_x) - 1;
+            block_x = nir_iand_imm(b, block, (size_x >> 1) - 1);
+            block_y = nir_ushr_imm(b, block, log2_half_size_x);
+         } else {
+            nir_def *half_size_x = nir_ushr_imm(b, size_x_imm, 1);
+            block_x = nir_umod(b, block, half_size_x);
+            block_y = nir_udiv(b, block, half_size_x);
+         }
+
+         x = nir_ior(b, nir_ishl_imm(b, block_x, 1), nir_iand_imm(b, block_pos, 1));
+         y = nir_ior(b, nir_ishl_imm(b, block_y, 1), nir_ushr_imm(b, block_pos, 1));
 
          return nir_vec3(b, x, y, z);
       }