From c2a25cf75cad20959e3760c13818379624264f3d Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 11 May 2023 15:22:44 -0700
Subject: [PATCH] intel/fs: Fix shift counts for 8- and 16-bit types

With regards to implicit masking of the shift counts for 8- and 16-bit
types, the PRMs are incorrect. They falsely state that on Gen9+ only the
low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
src0) are used. The Bspec (backed by data from experimentation) state
that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
types.

To match the behavior expected for the NIR opcodes, explicit masks for
8- and 16-bit types must be added.

This fixes (the updated version, see crucible!138) of
func.shader.shift.int16_t on all Intel platforms. According to Karol,
this also fixes "integer_ops integer_rotate" tests in OpenCL CTS.

No shader-db or fossil-db changes on any Intel platform.

Tested-by: Karol Herbst <kherbst@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23001>
---
 src/intel/compiler/brw_fs_nir.cpp | 59 +++++++++++++++----------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 25b22695dad..d1e712eb27b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1817,43 +1817,42 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    case nir_op_bitfield_insert:
       unreachable("not reached: should have been lowered");
 
-   /* For all shift operations:
+   /* With regards to implicit masking of the shift counts for 8- and 16-bit
+    * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
+    * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
+    * src0) are used. The Bspec (backed by data from experimentation) state
+    * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
+    * types.
     *
-    * Gen4 - Gen7: After application of source modifiers, the low 5-bits of
-    * src1 are used an unsigned value for the shift count.
-    *
-    * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
-    * 6-bit of src1 are used.
-    *
-    * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
-    * W or UW src0).
-    *
-    * The implication is that the following instruction will produce a
-    * different result on Gen9+ than on previous platforms:
-    *
-    *    shr(8)    g4<1>UW    g12<8,8,1>UW    0x0010UW
-    *
-    * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
-    *
-    * This does not seem to be the case.  Experimentally, it has been
-    * determined that shifts of 16-bit values on Gen8 behave properly.  Shifts
-    * of 8-bit values on both Gen8 and Gen9 do not.  Gen11+ lowers 8-bit
-    * values, so those platforms were not tested.  No features expose access
-    * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
-    * tested either.  See
-    * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
-    *
-    * This is part of the reason 8-bit values are lowered to 16-bit on all
-    * platforms.
+    * The match the behavior expected for the NIR opcodes, explicit masks for
+    * 8- and 16-bit types must be added.
     */
    case nir_op_ishl:
-      bld.SHL(result, op[0], op[1]);
+      if (instr->def.bit_size < 32) {
+         bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
+         bld.SHL(result, op[0], result);
+      } else {
+         bld.SHL(result, op[0], op[1]);
+      }
+
       break;
    case nir_op_ishr:
-      bld.ASR(result, op[0], op[1]);
+      if (instr->def.bit_size < 32) {
+         bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
+         bld.ASR(result, op[0], result);
+      } else {
+         bld.ASR(result, op[0], op[1]);
+      }
+
       break;
    case nir_op_ushr:
-      bld.SHR(result, op[0], op[1]);
+      if (instr->def.bit_size < 32) {
+         bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
+         bld.SHR(result, op[0], result);
+      } else {
+         bld.SHR(result, op[0], op[1]);
+      }
+
       break;
 
    case nir_op_urol: