diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index d8675c6fbcf..cc60c2868ac 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1817,14 +1817,63 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_bitfield_insert: unreachable("not reached: should have been lowered"); + /* For all shift operations: + * + * Gen4 - Gen7: After application of source modifiers, the low 5-bits of + * src1 are used an unsigned value for the shift count. + * + * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low + * 6-bit of src1 are used. + * + * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for + * W or UW src0). + * + * The implication is that the following instruction will produce a + * different result on Gen9+ than on previous platforms: + * + * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW + * + * where Gen9+ will shift by zero, and earlier platforms will shift by 16. + * + * This does not seem to be the case. Experimentally, I have determined + * that shifts of 16-bit values on Gen8 behave properly. Shifts of 8-bit + * values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit values, so + * those platforms were not tested. No features expose access to 8- or + * 16-bit types on Gen7 or earlier, so those platforms were not tested + * either. See + * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. + */ case nir_op_ishl: - bld.SHL(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.SHL(result, op[0], tmp); + } else { + bld.SHL(result, op[0], op[1]); + } + break; + case nir_op_ishr: - bld.ASR(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.ASR(result, op[0], tmp); + } else { + bld.ASR(result, op[0], op[1]); + } + break; + case nir_op_ushr: - bld.SHR(result, op[0], op[1]); + if (type_sz(op[0].type) == 1) { + fs_reg tmp = bld.vgrf(op[1].type); + bld.AND(tmp, op[1], retype(brw_imm_ud(7), op[1].type)); + bld.SHR(result, op[0], tmp); + } else { + bld.SHR(result, op[0], op[1]); + } + break; case nir_op_urol: diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp index abeac65577d..4e32f0cc892 100644 --- a/src/intel/compiler/brw_vec4_nir.cpp +++ b/src/intel/compiler/brw_vec4_nir.cpp @@ -1797,20 +1797,31 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) } break; + /* For all shift operations: + * + * Gen4 - Gen7: After application of source modifiers, the low 5-bits of + * src1 are used an unsigned value for the shift count. + * + * Later platforms alter this behavior, but those platforms are not + * supported by this code generator. + */ case nir_op_ishl: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(SHL(dst, op[0], op[1])); break; case nir_op_ishr: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(ASR(dst, op[0], op[1])); break; case nir_op_ushr: assert(nir_dest_bit_size(instr->dest.dest) < 64); + assert(type_sz(op[0].type) == 4); try_immediate_source(instr, op, false); emit(SHR(dst, op[0], op[1])); break;