diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 6c9aa7e5117..449eddece2a 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1511,6 +1511,10 @@ if (!isnormal(dst))
    dst = copysignf(0.0f, src0);
 """)
 
+opcode("ushr_and_pan", 0, tuint, [0, 0, 0], [tuint, tuint, tuint], False, "",
+       "(src0 >> (src1 & (sizeof(src0) * 8 - 1))) & src2",
+       description = "Unsigned right-shift followed by a bitwise AND." + shift_note)
+
 # vc4-specific opcodes
 
 # Saturated vector add for 4 8bit ints.
diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 558ee51abf1..bf5ae914554 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -3926,6 +3926,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
               instr->op == nir_op_sdot_4x8_iadd_sat);
       break;
    }
+   case nir_op_ushr_and_pan: {
+      bi_rshift_and_to(b, sz, dst, s0, s2, bi_byte(s1, 0), false);
+      break;
+   }
 
    default:
       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
@@ -5849,6 +5853,9 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust2_mode
       late_algebraic |= late_algebraic_progress;
    }
 
+   /* Vectorize instructions generated by bifrost_nir_lower_algebraic_late */
+   NIR_PASS(_, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id);
+
    while (late_algebraic) {
       late_algebraic = false;
       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
@@ -5858,6 +5865,9 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust2_mode
       NIR_PASS(_, nir, nir_opt_cse);
    }
 
+   /* Scalarize vectors generated by nir_opt_algebraic_late that are not supported */
+   NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id);
+
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
 
    NIR_PASS(_, nir, nir_opt_dce);
diff --git a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
index 47e8747b4e5..53896ba52a5 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
@@ -104,6 +104,13 @@ algebraic_late = [
     # On v11+, because FROUND.v2f16 is gone we end up with precision issues.
     # We lower ffract here instead to ensure lower_bit_size has been performed.
     (('ffract', a), ('fadd', a, ('fneg', ('ffloor', a))), 'gpu_arch >= 11'),
+
+    # Fuse SHIFT and bitwise AND
+    (('iand', ('ushr', 'a@32', 'b@32'), 'c@32'), ('ushr_and_pan', a, b, c)),
+    (('iand', ('ushr', 'a@16', 'b@32'), 'c@16'), ('ushr_and_pan', a, ('u2u16', b), c)),
+    (('iand', ('extract_u8', 'a@16', 1), 15), ('ushr_and_pan', a, 8, 15)),
+    (('iand', 'a@16', 15), ('ushr_and_pan', a, 0, 15)),
+    (('ushr', 'a@16', 12), ('ushr_and_pan', a, 12, 15)),
 ]
 
 # nir_lower_bool_to_bitsize can generate needless conversions.