From d79a31bf81a3527897f7c6f5178abd47d80fbaee Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Thu, 30 Jan 2025 11:56:23 +0000 Subject: [PATCH] pan/bi: Lower removed instructions in algebraic on v11+ This lowers all instructions that were removed on v11 to equivalents in algebraic and asserts in BIR emission to ensure they are never rematerialized. Signed-off-by: Mary Guillemard Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/panfrost/compiler/bifrost_compile.c | 34 +++++++++++++- src/panfrost/compiler/bifrost_nir.h | 2 +- .../compiler/bifrost_nir_algebraic.py | 46 ++++++++++++++++++- 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index a58288eb2e4..63b6127b2e4 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -2748,6 +2748,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) if (!(src_sz == 32 && comps == 2)) break; + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before if there is more than one component */ + assert(b->shader->arch < 11); + nir_alu_src *src = &instr->src[0]; bi_index idx = bi_src_index(&src->src); bi_index s0 = bi_extract(b, idx, src->swizzle[0]); @@ -3027,6 +3031,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) } case nir_op_f2i32: + /* v11 removed F16_TO_S32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else @@ -3035,6 +3042,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) /* Note 32-bit sources => no vectorization, so 32-bit works */ case nir_op_f2u16: + /* v11 removed V2F16_TO_V2U16 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else @@ -3042,6 +3052,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_f2i16: + /* v11 removed V2F16_TO_V2S16 */ + assert(src_sz == 32 || 
(b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else @@ -3049,6 +3062,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_f2u32: + /* v11 removed F16_TO_U32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && src_sz == 16)); + if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else @@ -3056,6 +3072,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_u2f16: + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before by algebraic. */ + assert(b->shader->arch < 11); + if (src_sz == 32) bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) @@ -3065,6 +3085,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_u2f32: + /* v11 removed U16_TO_F32 and U8_TO_F32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8))); + if (src_sz == 32) bi_u32_to_f32_to(b, dst, s0); else if (src_sz == 16) @@ -3074,6 +3097,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_i2f16: + /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been + * lowered before by algebraic. 
*/ + assert(b->shader->arch < 11); + if (src_sz == 32) bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) @@ -3083,7 +3110,8 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_i2f32: - assert(src_sz == 32 || src_sz == 16 || src_sz == 8); + /* v11 removed S16_TO_F32 and S8_TO_F32 */ + assert(src_sz == 32 || (b->shader->arch < 11 && (src_sz == 16 || src_sz == 8))); if (src_sz == 32) bi_s32_to_f32_to(b, dst, s0); @@ -4883,6 +4911,8 @@ bi_vectorize_filter(const nir_instr *instr, const void *data) case nir_op_f2f16: case nir_op_f2f16_rtz: case nir_op_f2f16_rtne: + case nir_op_u2f16: + case nir_op_i2f16: if (pan_arch(gpu_id) >= 11) return 1; @@ -5116,7 +5146,7 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) /* Prepass to simplify instruction selection */ late_algebraic = false; - NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); + NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late, pan_arch(gpu_id)); while (late_algebraic) { late_algebraic = false; diff --git a/src/panfrost/compiler/bifrost_nir.h b/src/panfrost/compiler/bifrost_nir.h index cf005826448..d2dcb4fe98d 100644 --- a/src/panfrost/compiler/bifrost_nir.h +++ b/src/panfrost/compiler/bifrost_nir.h @@ -25,7 +25,7 @@ #include "nir.h" #include "nir_builder.h" -bool bifrost_nir_lower_algebraic_late(nir_shader *shader); +bool bifrost_nir_lower_algebraic_late(nir_shader *shader, unsigned gpu_arch); bool bifrost_nir_lower_xfb(nir_shader *shader); bool bifrost_nir_opt_boolean_bitwise(nir_shader *shader); bool bifrost_nir_lower_load_output(nir_shader *nir); diff --git a/src/panfrost/compiler/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost_nir_algebraic.py index 362266569fc..ba82e08d45c 100644 --- a/src/panfrost/compiler/bifrost_nir_algebraic.py +++ b/src/panfrost/compiler/bifrost_nir_algebraic.py @@ -75,8 +75,48 @@ algebraic_late = [ # XXX: Duplicate of nir_lower_pack (('unpack_64_2x32', a), ('vec2', ('unpack_64_2x32_split_x', 
a), ('unpack_64_2x32_split_y', a))), + + # On v11+, all integer to F32 conversion variants are gone except the 32-bit ones (S32_TO_F32/U32_TO_F32). + (('i2f32', 'a@8'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'), + (('i2f32', 'a@16'), ('i2f32', ('i2i32', a)), 'gpu_arch >= 11'), + (('u2f32', 'a@8'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'), + (('u2f32', 'a@16'), ('u2f32', ('u2u32', a)), 'gpu_arch >= 11'), + + # On v11+, all integer to F16 conversion variants (V2XXX_TO_V2F16) are gone, so go through F32. + (('i2f16', 'a'), ('f2f16', ('i2f32', ('i2i32', a))), 'gpu_arch >= 11'), + (('u2f16', 'a'), ('f2f16', ('u2f32', ('u2u32', a))), 'gpu_arch >= 11'), + + # On v11+, V2F16_TO_V2S16 / V2F16_TO_V2U16 are gone + (('f2i16', 'a@16'), ('f2i16', ('f2f32', a)), 'gpu_arch >= 11'), + (('f2u16', 'a@16'), ('f2u16', ('f2f32', a)), 'gpu_arch >= 11'), + + # On v11+, F16_TO_S32/F16_TO_U32 are gone but we still have F32_TO_S32/F32_TO_U32 + (('f2i32', 'a@16'), ('f2i32', ('f2f32', a)), 'gpu_arch >= 11'), + (('f2u32', 'a@16'), ('f2u32', ('f2f32', a)), 'gpu_arch >= 11'), + + # On v11+, IABS.v4s8 is gone + (('iabs', 'a@8'), ('i2i8', ('iabs', ('i2i16', a))), 'gpu_arch >= 11'), + + # On v11+, ISUB.v4s8 is gone + (('ineg', 'a@8'), ('i2i8', ('ineg', ('i2i16', a))), 'gpu_arch >= 11'), + (('isub', 'a@8', 'b@8'), ('i2i8', ('isub', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'), + (('isub_sat', 'a@8', 'b@8'), ('i2i8', ('isub_sat', ('i2i16', a), ('i2i16', b))), 'gpu_arch >= 11'), + (('usub_sat', 'a@8', 'b@8'), ('u2u8', ('usub_sat', ('u2u16', a), ('u2u16', b))), 'gpu_arch >= 11'), ] +# On v11+, ICMP_OR.v4u8 was removed +for cond in ['ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']: + convert_8bit = 'u2u8' + convert_16bit = 'u2u16' + + if cond[0] == 'i': + convert_8bit = 'i2i8' + convert_16bit = 'i2i16' + + algebraic_late += [ + ((f'{cond}8', a, b), (convert_8bit, (f'{cond}16', (convert_16bit, a), (convert_16bit, b))), 'gpu_arch >= 11'), + ] + # Handling all combinations of boolean and float sizes for b2f is nontrivial. 
# bcsel has the same problem in more generality; lower b2f to bcsel in NIR to # reuse the efficient implementations of bcsel. This includes special handling @@ -108,8 +148,10 @@ def run(): print(nir_algebraic.AlgebraicPass("bifrost_nir_opt_boolean_bitwise", opt_bool_bitwise).render()) print(nir_algebraic.AlgebraicPass("bifrost_nir_lower_algebraic_late", - algebraic_late).render()) - + algebraic_late, + [ + ("unsigned ", "gpu_arch") + ]).render()) if __name__ == '__main__': main()