From d79a31bf81a3527897f7c6f5178abd47d80fbaee Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Thu, 30 Jan 2025 11:56:23 +0000 Subject: [PATCH] pan/bi: Lower removed instructions in algebraic on v11+ This lowers all instructions that were removed on v11 to equivalents in algebraic and asserts in BIR emission to ensure they are never rematerialized. Signed-off-by: Mary Guillemard Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/panfrost/compiler/bifrost_compile.c | 34 +++++++++++++- src/panfrost/compiler/bifrost_nir.h | 2 +- .../compiler/bifrost_nir_algebraic.py | 46 ++++++++++++++++++- 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index a58288eb2e4..63b6127b2e4 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -2748,6 +2748,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) if (!(src_sz == 32 && comps == 2)) break; + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before if there is more than one component */ + assert(b->shader->arch < 11); + nir_alu_src *src = &instr->src[0]; bi_index idx = bi_src_index(&src->src); bi_index s0 = bi_extract(b, idx, src->swizzle[0]); @@ -3027,6 +3031,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) } case nir_op_f2i32: + /* v11 removed F16_TO_S32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else @@ -3035,6 +3042,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) /* Note 32-bit sources => no vectorization, so 32-bit works */ case nir_op_f2u16: + /* v11 removed V2F16_TO_V2U16 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else @@ -3042,6 +3052,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_f2i16: + /* v11 removed V2F16_TO_V2S16 */ + assert(src_sz == 32 || 
(b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else @@ -3049,6 +3062,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_f2u32: + /* v11 removed F16_TO_U32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else @@ -3056,6 +3072,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_u2f16: + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before by algebraic. */ + assert(b->shader->arch < 11); + if (src_sz == 32) bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) @@ -3065,6 +3085,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_u2f32: + /* v11 removed U16_TO_F32 and U8_TO_F32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8))); + if (src_sz == 32) bi_u32_to_f32_to(b, dst, s0); else if (src_sz == 16) @@ -3074,6 +3097,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_i2f16: + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before by algebraic. 
*/ + assert(b->shader->arch < 11); + if (src_sz == 32) bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) @@ -3083,7 +3110,8 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_i2f32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + /* v11 removed S16_TO_F32 and S8_TO_F32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8))); if (src_sz == 32) bi_s32_to_f32_to(b, dst, s0); @@ -4883,6 +4911,8 @@ bi_vectorize_filter(const nir_instr *instr, const void *data) case nir_op_f2f16: case nir_op_f2f16_rtz: case nir_op_f2f16_rtne: + case nir_op_u2f16: + case nir_op_i2f16: if (pan_arch(gpu_id) >= 11) return 1; @@ -5116,7 +5146,7 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) /* Prepass to simplify instruction selection */ late_algebraic = false; - NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); + NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late, pan_arch(gpu_id)); while (late_algebraic) { late_algebraic = false; diff --git a/src/panfrost/compiler/bifrost_nir.h b/src/panfrost/compiler/bifrost_nir.h index cf005826448..d2dcb4fe98d 100644 --- a/src/panfrost/compiler/bifrost_nir.h +++ b/src/panfrost/compiler/bifrost_nir.h @@ -25,7 +25,7 @@ #include "nir.h" #include "nir_builder.h" -bool bifrost_nir_lower_algebraic_late(nir_shader *shader); +bool bifrost_nir_lower_algebraic_late(nir_shader *shader, unsigned gpu_arch); bool bifrost_nir_lower_xfb(nir_shader *shader); bool bifrost_nir_opt_boolean_bitwise(nir_shader *shader); bool bifrost_nir_lower_load_output(nir_shader *nir); diff --git a/src/panfrost/compiler/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost_nir_algebraic.py index 362266569fc..ba82e08d45c 100644 --- a/src/panfrost/compiler/bifrost_nir_algebraic.py +++ b/src/panfrost/compiler/bifrost_nir_algebraic.py @@ -75,8 +75,48 @@ algebraic_late = [ # XXX: Duplicate of nir_lower_pack (('unpack_64_2x32', a), ('vec2', ('unpack_64_2x32_split_x', 
a), ('unpack_64_2x32_split_y', a))), + + # On v11+, all integer to F32 conversion variants are gone except the 32-bit ones (S32_TO_F32/U32_TO_F32). + (('i2f32', 'a@8'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'), + (('i2f32', 'a@16'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'), + (('u2f32', 'a@8'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'), + (('u2f32', 'a@16'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'), + + # On v11+, all integer to F16 conversion variants (V2XXX_TO_V2F16) are gone, so go through F32. + (('i2f16', 'a'), ('f2f16', ('i2f32', ('i2i32', a))), 'gpu_arch >= 11'), + (('u2f16', 'a'), ('f2f16', ('u2f32', ('u2u32', a))), 'gpu_arch >= 11'), + + # On v11+, V2F16_TO_V2S16 / V2F16_TO_V2U16 are gone + (('f2i16', 'a@16'), ('f2i16', ('f2f32', a)), 'gpu_arch >= 11'), + (('f2u16', 'a@16'), ('f2u16', ('f2f32', a)), 'gpu_arch >= 11'), + + # On v11+, F16_TO_S32/F16_TO_U32 are gone but we still have F32_TO_S32/F32_TO_U32 + (('f2i32', 'a@16'), ('f2i32', ('f2f32', a)), 'gpu_arch >= 11'), + (('f2u32', 'a@16'), ('f2u32', ('f2f32', a)), 'gpu_arch >= 11'), + + # On v11+, IABS.v4s8 is gone + (('iabs', 'a@8'), ('i2i8', ('iabs', ('i2i16', a))), 'gpu_arch >= 11'), + + # On v11+, ISUB.v4s8 is gone + (('ineg', 'a@8'), ('i2i8', ('ineg', ('i2i16', a))), 'gpu_arch >= 11'), + (('isub', 'a@8', 'b@8'), ('i2i8', ('isub', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'), + (('isub_sat', 'a@8', 'b@8'), ('i2i8', ('isub_sat', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'), + (('usub_sat', 'a@8', 'b@8'), ('u2u8', ('usub_sat', ('u2u16', a), ('u2u16', b))), 'gpu_arch >= 11'), ] +# On v11+, ICMP_OR.v4u8 was removed +for cond in ['ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']: + convert_8bit = 'u2u8' + convert_16bit = 'u2u16' + + if cond[0] == 'i': + convert_8bit = 'i2i8' + convert_16bit = 'i2i16' + + algebraic_late += [ + ((f'{cond}8', a, b), (convert_8bit, (f'{cond}16', (convert_16bit, a), (convert_16bit, b))), 'gpu_arch >= 11'), + ] + # Handling all combinations of boolean and float sizes for b2f is nontrivial. 
# bcsel has the same problem in more generality; lower b2f to bcsel in NIR to # reuse the efficient implementations of bcsel. This includes special handling @@ -108,8 +148,10 @@ def run(): print(nir_algebraic.AlgebraicPass("bifrost_nir_opt_boolean_bitwise", opt_bool_bitwise).render()) print(nir_algebraic.AlgebraicPass("bifrost_nir_lower_algebraic_late", - algebraic_late).render()) - + algebraic_late, + [ + ("unsigned ", "gpu_arch") + ]).render()) if __name__ == '__main__': main()