From c2a25cf75cad20959e3760c13818379624264f3d Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 11 May 2023 15:22:44 -0700 Subject: [PATCH] intel/fs: Fix shift counts for 8- and 16-bit types With regards to implicit masking of the shift counts for 8- and 16-bit types, the PRMs are incorrect. They falsely state that on Gen9+ only the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW src0) are used. The Bspec (backed by data from experimentation) state that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other types. To match the behavior expected for the NIR opcodes, explicit masks for 8- and 16-bit types must be added. This fixes (the updated version, see crucible!138) of func.shader.shift.int16_t on all Intel platforms. According to Karol, this also fixes "integer_ops integer_rotate" tests in OpenCL CTS. No shader-db or fossil-db changes on any Intel platform. Tested-by: Karol Herbst Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 59 +++++++++++++++---------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 25b22695dad..d1e712eb27b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1817,43 +1817,42 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr, case nir_op_bitfield_insert: unreachable("not reached: should have been lowered"); - /* For all shift operations: + /* With regards to implicit masking of the shift counts for 8- and 16-bit + * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only + * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW + * src0) are used. The Bspec (backed by data from experimentation) state + * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other + * types. * - * Gen4 - Gen7: After application of source modifiers, the low 5-bits of - * src1 are used an unsigned value for the shift count. - * - * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low - * 6-bit of src1 are used. - * - * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for - * W or UW src0). - * - * The implication is that the following instruction will produce a - * different result on Gen9+ than on previous platforms: - * - * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW - * - * where Gen9+ will shift by zero, and earlier platforms will shift by 16. - * - * This does not seem to be the case. Experimentally, it has been - * determined that shifts of 16-bit values on Gen8 behave properly. Shifts - * of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit - * values, so those platforms were not tested. No features expose access - * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not - * tested either. See - * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. - * - * This is part of the reason 8-bit values are lowered to 16-bit on all - * platforms. + * The match the behavior expected for the NIR opcodes, explicit masks for + * 8- and 16-bit types must be added. */ case nir_op_ishl: - bld.SHL(result, op[0], op[1]); + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHL(result, op[0], result); + } else { + bld.SHL(result, op[0], op[1]); + } + break; case nir_op_ishr: - bld.ASR(result, op[0], op[1]); + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.ASR(result, op[0], result); + } else { + bld.ASR(result, op[0], op[1]); + } + break; case nir_op_ushr: - bld.SHR(result, op[0], op[1]); + if (instr->def.bit_size < 32) { + bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1)); + bld.SHR(result, op[0], result); + } else { + bld.SHR(result, op[0], op[1]); + } + break; case nir_op_urol: