From 51808a207673471fd242f77491632170b8c0a9c2 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 26 Jan 2021 15:27:33 -0800
Subject: [PATCH] intel/compiler: Properly handle shift count for 8-bit sources

This fixes the Crucible func.shader.shift.int8_t test on Gen8 and Gen9.
See https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.

No changes in fossil-db because there are no shaders in fossil-db that
use shaderInt8. :(

A couple alternatives were considered.

1. Lower 8-bit integers to 16-bit on all platforms.  I looked at the
   output of a few shaders from the Vulkan CTS, and it was a mess.
   There were so many extra type converting MOVs.  I think all of that
   could be cleaned up, but it would be more work.  It would also not be
   great for cherry-picking to a stable branch.

   This *is* the approach that will be taken Mesa 21.1.  See also
   https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8730.

2. Disable the optimization that prunes the `& 7`.  This would be more
   optimal in shaders that don't have the explicit mask, but it's not
   very future proof.  It would potentially require auditing future
   optimizations to make sure they don't run afoul of this problem.

In the end, the easiest solution seems to be adding the extra mask to
implement the specified semantics of the NIR shift instructions...
especially since the only shaders we have that use shaderInt8 are from
the CTS.

v2: Use braces in the else part because they were used in the then part.
Suggested by Jason.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Fixes: 26fc5e1f4a8 ("nir/algebraic: expand existing 32-bit patterns to all bit sizes using loops")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9001>
---
 src/intel/compiler/brw_fs_nir.cpp   | 55 +++++++++++++++++++++++++++--
 src/intel/compiler/brw_vec4_nir.cpp | 11 ++++++
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index d8675c6fbcf..cc60c2868ac 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1817,14 +1817,63 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
    case nir_op_bitfield_insert:
       unreachable("not reached: should have been lowered");
 
+   /* For all shift operations:
+    *
+    * Gen4 - Gen7: After application of source modifiers, the low 5-bits of
+    * src1 are used an unsigned value for the shift count.
+    *
+    * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
+    * 6-bit of src1 are used.
+    *
+    * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
+    * W or UW src0).
+    *
+    * The implication is that the following instruction will produce a
+    * different result on Gen9+ than on previous platforms:
+    *
+    *    shr(8)    g4<1>UW    g12<8,8,1>UW    0x0010UW
+    *
+    * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
+    *
+    * This does not seem to be the case.  Experimentally, I have determined
+    * that shifts of 16-bit values on Gen8 behave properly.  Shifts of 8-bit
+    * values on both Gen8 and Gen9 do not.  Gen11+ lowers 8-bit values, so
+    * those platforms were not tested.  No features expose access to 8- or
+    * 16-bit types on Gen7 or earlier, so those platforms were not tested
+    * either.  See
+    * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
+    */
    case nir_op_ishl:
-      bld.SHL(result, op[0], op[1]);
+      if (type_sz(op[0].type) == 1) {
+         fs_reg tmp = bld.vgrf(op[1].type);
+         bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type));
+         bld.SHL(result, op[0], tmp);
+      } else {
+         bld.SHL(result, op[0], op[1]);
+      }
+
       break;
+
    case nir_op_ishr:
-      bld.ASR(result, op[0], op[1]);
+      if (type_sz(op[0].type) == 1) {
+         fs_reg tmp = bld.vgrf(op[1].type);
+         bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type));
+         bld.ASR(result, op[0], tmp);
+      } else {
+         bld.ASR(result, op[0], op[1]);
+      }
+
       break;
+
    case nir_op_ushr:
-      bld.SHR(result, op[0], op[1]);
+      if (type_sz(op[0].type) == 1) {
+         fs_reg tmp = bld.vgrf(op[1].type);
+         bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type));
+         bld.SHR(result, op[0], tmp);
+      } else {
+         bld.SHR(result, op[0], op[1]);
+      }
+
       break;
 
    case nir_op_urol:
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index abeac65577d..4e32f0cc892 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -1797,20 +1797,31 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       }
       break;
 
+   /* For all shift operations:
+    *
+    * Gen4 - Gen7: After application of source modifiers, the low 5-bits of
+    * src1 are used an unsigned value for the shift count.
+    *
+    * Later platforms alter this behavior, but those platforms are not
+    * supported by this code generator.
+    */
    case nir_op_ishl:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      assert(type_sz(op[0].type) == 4);
       try_immediate_source(instr, op, false);
       emit(SHL(dst, op[0], op[1]));
       break;
 
    case nir_op_ishr:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      assert(type_sz(op[0].type) == 4);
       try_immediate_source(instr, op, false);
       emit(ASR(dst, op[0], op[1]));
       break;
 
    case nir_op_ushr:
       assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      assert(type_sz(op[0].type) == 4);
       try_immediate_source(instr, op, false);
       emit(SHR(dst, op[0], op[1]));
       break;