From 51808a207673471fd242f77491632170b8c0a9c2 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 26 Jan 2021 15:27:33 -0800 Subject: [PATCH] intel/compiler: Properly handle shift count for 8-bit sources This fixes the Crucible func.shader.shift.int8_t test on Gen8 and Gen9. See https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. No changes in fossil-db because there are no shaders in fossil-db that use shaderInt8. :( A couple alternatives were considered. 1. Lower 8-bit integers to 16-bit on all platforms. I looked at the output of a few shaders from the Vulkan CTS, and it was a mess. There were so many extra type converting MOVs. I think all of that could be cleaned up, but it would be more work. It would also not be great for cherry-picking to a stable branch. This *is* the approach that will be taken Mesa 21.1. See also https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8730. 2. Disable the optimization that prunes the `& 7`. This would be more optimal in shaders that don't have the explicit mask, but it's not very future proof. It would potentially require auditing future optimizations to make sure they don't run afoul of this problem. In the end, the easiest solution seems to be adding the extra mask to implement the specified semantics of the NIR shift instructions... especially since the only shaders we have that use shaderInt8 are from the CTS. v2: Use braces in the else part because they were used in the then part. Suggested by Jason. Reviewed-by: Jason Ekstrand Fixes: 26fc5e1f4a8 ("nir/algebraic: expand existing 32-bit patterns to all bit sizes using loops") Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 55 +++++++++++++++++++++++++++-- src/intel/compiler/brw_vec4_nir.cpp | 11 ++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index d8675c6fbcf..cc60c2868ac 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1817,14 +1817,63 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_bitfield_insert: unreachable("not reached: should have been lowered"); + /* For all shift operations: + * + * Gen4 - Gen7: After application of source modifiers, the low 5-bits of + * src1 are used an unsigned value for the shift count. + * + * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low + * 6-bit of src1 are used. + * + * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for + * W or UW src0). + * + * The implication is that the following instruction will produce a + * different result on Gen9+ than on previous platforms: + * + * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW + * + * where Gen9+ will shift by zero, and earlier platforms will shift by 16. + * + * This does not seem to be the case. Experimentally, I have determined + * that shifts of 16-bit values on Gen8 behave properly. Shifts of 8-bit + * values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit values, so + * those platforms were not tested. No features expose access to 8- or + * 16-bit types on Gen7 or earlier, so those platforms were not tested + * either. See + * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. + */ case nir_op_ishl: - bld.SHL(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.SHL(result, op[0], tmp); + } else { + bld.SHL(result, op[0], op[1]); + } + break; + case nir_op_ishr: - bld.ASR(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.ASR(result, op[0], tmp); + } else { + bld.ASR(result, op[0], op[1]); + } + break; + case nir_op_ushr: - bld.SHR(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.SHR(result, op[0], tmp); + } else { + bld.SHR(result, op[0], op[1]); + } + break; case nir_op_urol: diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp index abeac65577d..4e32f0cc892 100644 --- a/src/intel/compiler/brw_vec4_nir.cpp +++ b/src/intel/compiler/brw_vec4_nir.cpp @@ -1797,20 +1797,31 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) } break; + /* For all shift operations: + * + * Gen4 - Gen7: After application of source modifiers, the low 5-bits of + * src1 are used an unsigned value for the shift count. + * + * Later platforms alter this behavior, but those platforms are not + * supported by this code generator. + */ case nir_op_ishl: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(SHL(dst, op[0], op[1])); break; case nir_op_ishr: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(ASR(dst, op[0], op[1])); break; case nir_op_ushr: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(SHR(dst, op[0], op[1])); break;