From 64bc538f5e2f73bfb21717f898d7b54aab323adc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= <jose.souza@intel.com>
Date: Wed, 8 Apr 2026 12:02:50 -0700
Subject: [PATCH] intel/brw: Explicitly upcast UB to UW for SHR with vector
 immediates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HW does not allow instructions with vector immediates to cross a GRF boundary if
it has a stride.

Under register pressure, the register allocator may place a temporary register
across such a boundary.

To resolve this, we now explicitly emit a MOV to upcast the UB payload into a
UW VGRF.
This ensures the SHR instruction operates on a dense, well-aligned region that
satisfies hardware alignment constraints.

Below is the portion of the shader exhibiting this issue:

Native code for unnamed fragment shader GLSL6 (src_hash 0x9c84a007) (sha1 48745e7dae90d08f8a9bbe4dbf837de23440c841f0344e669cb8af9df79bce58)
SIMD32 shader: 44 instructions. 0 loops. 354 cycles. 0:0 spills:fills, 2 sends, scheduled with mode latency-sensitive. Promoted 0 constants. GRF registers: 22. Non-SSA regs (after NIR): 11. Compacted 800 to 800 bytes (0%)
mov(1)          f1<1>UW         g0.30<0,1,0>UW                  { align1 WE_all 1N };
mov(1)          f1.1<1>UW       g1.30<0,1,0>UW                  { align1 WE_all 1N I@1 };
mov(32)         g2<2>UW         g0.20<2,8,0>UW                  { align1 WE_all };
mov(32)         g4<2>UW         g0.21<2,8,0>UW                  { align1 WE_all };
mov(32)         g8<2>UW         g1.20<2,8,0>UW                  { align1 WE_all };
mov(32)         g10<2>UW        g1.21<2,8,0>UW                  { align1 WE_all };
mov(16)         g12<4>UB        g0.60<1,8,0>UB                  { align1 1H };
mov(16)         g13<4>UB        g1.60<1,8,0>UB                  { align1 2H };
add(32)         g0<1>UW         g2<16,8,2>UW    0x01000100V     { align1 WE_all I@6 };
add(32)         g1<1>UW         g4<16,8,2>UW    0x01010000V     { align1 WE_all I@6 };
add(32)         g2<1>UW         g8<16,8,2>UW    0x01000100V     { align1 WE_all I@6 };
add(32)         g3<1>UW         g10<16,8,2>UW   0x01010000V     { align1 WE_all I@6 };
shr(16)         g4<1>UW         g12<32,8,4>UB   0x76543210V     { align1 1H I@6 };
mov(16)         g14.32<4>UB     g13<32,8,4>UB                   { align1 2H I@6 };
sync nop(1)                     null<0,1,0>UB                   { align1 WE_all 1N I@6 };
mov(16)         g5<1>UW         g0<16,8,2>UW                    { align1 1H };
sync nop(1)                     null<0,1,0>UB                   { align1 WE_all 1N I@6 };
mov(16)         g0<1>UW         g1<16,8,2>UW                    { align1 1H };
sync nop(1)                     null<0,1,0>UB                   { align1 WE_all 5N I@6 };
mov(16)         g5.16<1>UW      g2<16,8,2>UW                    { align1 2H };
sync nop(1)                     null<0,1,0>UB                   { align1 WE_all 5N I@6 };
mov(16)         g0.16<1>UW      g3<16,8,2>UW                    { align1 2H };
shr(16)         g4.16<1>UW      g14.32<32,8,4>UB 0x76543210V    { align1 2H I@5 };
    ERROR: Invalid register region for source 0.  See special restrictions section.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40856>
---
 src/intel/compiler/brw/brw_from_nir.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index ccd51256ac8..71e54358976 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -194,8 +194,11 @@ emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
                 */
                const struct brw_reg reg = s.devinfo->ver >= 20 ?
                   xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7);
+               brw_reg mask_uw = hbld.vgrf(BRW_TYPE_UW);
+               hbld.MOV(mask_uw, stride(retype(reg, BRW_TYPE_UB), 1, 8, 0));
+
                hbld.SHR(offset(shifted, hbld, i),
-                        stride(retype(reg, BRW_TYPE_UB), 1, 8, 0),
+                        mask_uw,
                         brw_imm_v(0x76543210));
             }
 
@@ -3528,8 +3531,11 @@ emit_sampleid_setup(nir_to_brw_state &ntb)
        */
       const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
                                     brw_vec1_grf(i + 1, 0);
+      brw_reg mask_uw = hbld.vgrf(BRW_TYPE_UW);
+      hbld.MOV(mask_uw, stride(retype(id_reg, BRW_TYPE_UB), 1, 8, 0));
+
       hbld.SHR(offset(tmp, hbld, i),
-               stride(retype(id_reg, BRW_TYPE_UB), 1, 8, 0),
+               mask_uw,
                brw_imm_v(0x44440000));
    }