pan/bi: Lower removed instructions in algebraic on v11+
This lowers all instructions that were removed on v11 to equivalents in algebraic, and asserts in BIR emission to ensure they are never rematerialized.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33608>
parent be011e8675
commit d79a31bf81
3 changed files with 77 additions and 5 deletions
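Each entry in the algebraic list changed below is a nir_algebraic (search, replace, condition) tuple; the new 'gpu_arch >= 11' strings are conditions evaluated against the gpu_arch parameter that the generated pass now takes (see the header and params changes further down). A minimal standalone sketch, in plain Python outside the real bifrost_nir_algebraic.py, of how one such lowering reads:

# Illustrative only: mirrors the shape of the rules added in this commit.
a = 'a'  # pattern variable, as in the real rule list

example_rules = [
    # v11+ has no direct int-to-f16 conversion, so widen to a 32-bit
    # integer, convert to f32, then narrow to f16.
    (('i2f16', a), ('f2f16', ('i2f32', ('i2i32', a))), 'gpu_arch >= 11'),
]

for search, replace, condition in example_rules:
    print(search, '->', replace, 'when', condition)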
@@ -2748,6 +2748,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      if (!(src_sz == 32 && comps == 2))
         break;

      /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been
       * lowered before if there is more than one component */
      assert(b->shader->arch < 11);

      nir_alu_src *src = &instr->src[0];
      bi_index idx = bi_src_index(&src->src);
      bi_index s0 = bi_extract(b, idx, src->swizzle[0]);

@@ -3027,6 +3031,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
   }

   case nir_op_f2i32:
      /* v11 removed F16_TO_S32 */
      assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16));

      if (src_sz == 32)
         bi_f32_to_s32_to(b, dst, s0);
      else

@@ -3035,6 +3042,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)

   /* Note 32-bit sources => no vectorization, so 32-bit works */
   case nir_op_f2u16:
      /* v11 removed V2F16_TO_V2U16 */
      assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16));

      if (src_sz == 32)
         bi_f32_to_u32_to(b, dst, s0);
      else

@@ -3042,6 +3052,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_f2i16:
      /* v11 removed V2F16_TO_V2S16 */
      assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16));

      if (src_sz == 32)
         bi_f32_to_s32_to(b, dst, s0);
      else

@@ -3049,6 +3062,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_f2u32:
      /* v11 removed F16_TO_U32 */
      assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16));

      if (src_sz == 32)
         bi_f32_to_u32_to(b, dst, s0);
      else

@@ -3056,6 +3072,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_u2f16:
      /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been
       * lowered before by algebraic. */
      assert(b->shader->arch < 11);

      if (src_sz == 32)
         bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
      else if (src_sz == 16)

@@ -3065,6 +3085,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_u2f32:
      /* v11 removed U16_TO_F32 and U8_TO_F32 */
      assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8)));

      if (src_sz == 32)
         bi_u32_to_f32_to(b, dst, s0);
      else if (src_sz == 16)

@@ -3074,6 +3097,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_i2f16:
      /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been
       * lowered before by algebraic. */
      assert(b->shader->arch < 11);

      if (src_sz == 32)
         bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
      else if (src_sz == 16)

@@ -3083,7 +3110,8 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_i2f32:
-     assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
      /* v11 removed S16_TO_F32 and S8_TO_F32 */
      assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8)));

      if (src_sz == 32)
         bi_s32_to_f32_to(b, dst, s0);

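For reference, the conversion opcodes that the new asserts above treat as unavailable on v11+, collected from the comments in those hunks. This is a throwaway Python summary for reading the diff, not driver code, and the table is only as precise as those comments:

# NIR op and source size -> Bifrost conversion opcode removed on v11+.
removed_on_v11 = {
    ('f2i32', 16): 'F16_TO_S32',
    ('f2u32', 16): 'F16_TO_U32',
    ('f2i16', 16): 'V2F16_TO_V2S16',
    ('f2u16', 16): 'V2F16_TO_V2U16',
    ('i2f32', 16): 'S16_TO_F32',
    ('i2f32', 8): 'S8_TO_F32',
    ('u2f32', 16): 'U16_TO_F32',
    ('u2f32', 8): 'U8_TO_F32',
    ('i2f16', 16): 'V2XXX_TO_V2F16 family',
    ('u2f16', 16): 'V2XXX_TO_V2F16 family',
}

for (nir_op, src_sz), bi_op in removed_on_v11.items():
    print(f'{nir_op} with {src_sz}-bit sources needs pre-lowering: {bi_op} is gone')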
@@ -4883,6 +4911,8 @@ bi_vectorize_filter(const nir_instr *instr, const void *data)
   case nir_op_f2f16:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16_rtne:
   case nir_op_u2f16:
   case nir_op_i2f16:
      if (pan_arch(gpu_id) >= 11)
         return 1;

@@ -5116,7 +5146,7 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)

   /* Prepass to simplify instruction selection */
   late_algebraic = false;
-  NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
   NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late, pan_arch(gpu_id));

   while (late_algebraic) {
      late_algebraic = false;

@@ -25,7 +25,7 @@
#include "nir.h"
#include "nir_builder.h"

-bool bifrost_nir_lower_algebraic_late(nir_shader *shader);
bool bifrost_nir_lower_algebraic_late(nir_shader *shader, unsigned gpu_arch);
bool bifrost_nir_lower_xfb(nir_shader *shader);
bool bifrost_nir_opt_boolean_bitwise(nir_shader *shader);
bool bifrost_nir_lower_load_output(nir_shader *nir);

@@ -75,8 +75,48 @@ algebraic_late = [
    # XXX: Duplicate of nir_lower_pack
    (('unpack_64_2x32', a), ('vec2', ('unpack_64_2x32_split_x', a),
                                     ('unpack_64_2x32_split_y', a))),

    # On v11+, all non-integer variants converting to F32 are gone except for S32_TO_F32.
    (('i2f32', 'a@8'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'),
    (('i2f32', 'a@16'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'),
    (('u2f32', 'a@8'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'),
    (('u2f32', 'a@16'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'),

    # On v11+, all non-integer variants converting to F16 are gone except for S32_TO_F32.
    (('i2f16', 'a'), ('f2f16', ('i2f32', ('i2i32', a))), 'gpu_arch >= 11'),
    (('u2f16', 'a'), ('f2f16', ('u2f32', ('u2u32', a))), 'gpu_arch >= 11'),

    # On v11+, V2F16_TO_V2S16 / V2F16_TO_V2U16 are gone
    (('f2i16', 'a@16'), ('f2i16', ('f2f32', a)), 'gpu_arch >= 11'),
    (('f2u16', 'a@16'), ('f2u16', ('f2f32', a)), 'gpu_arch >= 11'),

    # On v11+, F16_TO_S32/F16_TO_U32 are gone but we still have F32_TO_S32/F32_TO_U32
    (('f2i32', 'a@16'), ('f2i32', ('f2f32', a)), 'gpu_arch >= 11'),
    (('f2u32', 'a@16'), ('f2u32', ('f2f32', a)), 'gpu_arch >= 11'),

    # On v11+, IABS.v4s8 is gone
    (('iabs', 'a@8'), ('i2i8', ('iabs', ('i2i16', a))), 'gpu_arch >= 11'),

    # On v11+, ISUB.v4s8 is gone
    (('ineg', 'a@8'), ('i2i8', ('ineg', ('i2i16', a))), 'gpu_arch >= 11'),
    (('isub', 'a@8', 'b@8'), ('i2i8', ('isub', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'),
    (('isub_sat', 'a@8', 'b@8'), ('i2i8', ('isub_sat', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'),
    (('usub_sat', 'a@8', 'b@8'), ('u2u8', ('usub_sat', ('u2u16', a), ('u2u16', b))), 'gpu_arch >= 11'),
]

# On v11+, ICMP_OR.v4u8 was removed
for cond in ['ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']:
    convert_8bit = 'u2u8'
    convert_16bit = 'u2u16'

    if cond[0] == 'i':
        convert_8bit = 'i2i8'
        convert_16bit = 'i2i16'

    algebraic_late += [
        ((f'{cond}8', a, b), (convert_8bit, (f'{cond}16', (convert_16bit, a), (convert_16bit, b))), 'gpu_arch >= 11'),
    ]

# Handling all combinations of boolean and float sizes for b2f is nontrivial.
# bcsel has the same problem in more generality; lower b2f to bcsel in NIR to
# reuse the efficient implementations of bcsel. This includes special handling

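To see what the comparison loop above expands to, a standalone sketch that rebuilds the same tuples outside the build system and prints them; the real loop appends them to algebraic_late exactly as shown in the hunk:

# Each 8-bit integer comparison becomes a 16-bit comparison wrapped in size
# casts, since ICMP_OR.v4u8 is gone on v11+.
a, b = 'a', 'b'

rules = []
for cond in ['ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']:
    signed = cond[0] == 'i'
    convert_8bit = 'i2i8' if signed else 'u2u8'
    convert_16bit = 'i2i16' if signed else 'u2u16'
    rules.append(((f'{cond}8', a, b),
                  (convert_8bit,
                   (f'{cond}16', (convert_16bit, a), (convert_16bit, b))),
                  'gpu_arch >= 11'))

for search, replace, condition in rules:
    print(search, '->', replace)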
@@ -108,8 +148,10 @@ def run():
    print(nir_algebraic.AlgebraicPass("bifrost_nir_opt_boolean_bitwise",
                                      opt_bool_bitwise).render())
    print(nir_algebraic.AlgebraicPass("bifrost_nir_lower_algebraic_late",
-                                     algebraic_late).render())
                                      algebraic_late,
                                      [
                                          ("unsigned ", "gpu_arch")
                                      ]).render())

if __name__ == '__main__':
    main()