pan/bi: Stop using V2F32_TO_V2F16 on Valhall
On v11+, V2F32_TO_V2F16 doesn't exist anymore. This commit ensures we
stop using it on every codepath, except when a vectorized conversion is
preferred (v9-v10). Instead, we use FADD.F32 to handle the data
conversion, thanks to the swizzle defined for the destination. This
also works on older Valhall gens, so let's follow that logic when only
one component is used.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33608>
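For readers skimming the diff, the destination-swizzle trick the message refers to looks like this. A minimal sketch using the builder helpers that appear below (bi_temp, bi_half, bi_fadd_f32_to, bi_imm_u32); it illustrates the idea and is not an excerpt from the commit:

   /* On Valhall (v9+), an F32 op that writes only a 16-bit half of its
    * destination converts the result to F16 on write-out. Adding +0.0f
    * (bits 0x0) thus turns FADD.F32 into a pure F32 -> F16 conversion. */
   bi_index dest = bi_half(bi_temp(b->shader), false); /* write .h0 only */
   bi_instr *cvt = bi_fadd_f32_to(b, dest, src, bi_imm_u32(0));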
This commit is contained in:
parent 947264e18a
commit b63ef74e73

1 changed file with 57 additions and 8 deletions
@@ -278,6 +278,19 @@ bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
    return dst;
 }
 
+static inline bi_instr *
+bi_f32_to_f16_to(bi_builder *b, bi_index dest, bi_index src)
+{
+   /* Use V2F32_TO_V2F16 on Bifrost, FADD otherwise */
+   if (b->shader->arch < 9)
+      return bi_v2f32_to_v2f16_to(b, dest, src, src);
+
+   assert(dest.swizzle != BI_SWIZZLE_H01);
+
+   /* FADD with 0 and force conversion to F16 on Valhall and later */
+   return bi_fadd_f32_to(b, dest, src, bi_imm_u32(0));
+}
+
 static bi_index
 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
 {
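The assert documents that on v9+ this helper can only produce a single F16 half: the destination must carry an .h0 or .h1 swizzle. A quick illustration (mine, not part of the change) of what callers may pass:

   bi_index lo  = bi_half(bi_temp(b->shader), false); /* .h0: accepted */
   bi_index hi  = bi_half(bi_temp(b->shader), true);  /* .h1: accepted */
   bi_index all = bi_temp(b->shader); /* defaults to .h01: trips the assert;
                                         a packed write needs V2F32_TO_V2F16 */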
@@ -319,7 +332,19 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
                              bi_imm_u32(8), BI_SPECIAL_NONE);
       }
 
-      f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
+      /* On v11+, V2F32_TO_V2F16 is gone */
+      if (b->shader->arch >= 11) {
+         bi_index tmp[2];
+
+         for (int i = 0; i < 2; i++) {
+            tmp[i] = bi_half(bi_temp(b->shader), false);
+            bi_f32_to_f16_to(b, tmp[i], f[i]);
+         }
+
+         f16 = bi_mkvec_v2i16(b, tmp[0], tmp[1]);
+      } else {
+         f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
+      }
    }
 
    return bi_v2f16_to_v2s16(b, f16);
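The v11+ branch above is the general recipe for replacing the packed conversion: convert each component into its own half-register, then pack with MKVEC.v2i16. In pseudo-assembly the loop amounts to something like this (illustrative mnemonics, not verified disassembly):

   FADD.f32    t0.h0, f0, #0x0      // F32 -> F16 into the low half
   FADD.f32    t1.h0, f1, #0x0
   MKVEC.v2i16 f16, t0.h0, t1.h0    // pack the two halves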
@@ -2651,13 +2676,26 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
    case nir_op_f2f16:
    case nir_op_f2f16_rtz:
    case nir_op_f2f16_rtne: {
+      /* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been
+       * lowered before if there is more than one component */
+      assert(b->shader->arch < 11 || comps == 1);
       assert(src_sz == 32);
       bi_index idx = bi_src_index(&instr->src[0].src);
       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
-      bi_index s1 =
-         comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
+      bi_instr *I;
 
-      bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
+      /* Use V2F32_TO_V2F16 if vectorized */
+      if (comps == 2) {
+         /* Starting with v11, we don't have V2F32_TO_V2F16, this should have
+          * been lowered before if there is more than one component */
+         assert(b->shader->arch < 11);
+         bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
+         I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
+      } else {
+         assert(comps == 1);
+         I = bi_f32_to_f16_to(b, dst, s0);
+      }
 
       /* Override rounding if explicitly requested. Otherwise, the
        * default rounding mode is selected by the builder. Depending
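The trailing context refers to the pre-existing rounding override. For readers without the full file, it has roughly this shape (sketch; the exact enum value used for nearest-even is an assumption):

   if (instr->op == nir_op_f2f16_rtz)
      I->round = BI_ROUND_RTZ;
   else if (instr->op == nir_op_f2f16_rtne)
      I->round = BI_ROUND_NONE; /* builder default, round to nearest even */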
@@ -2952,7 +2990,8 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       break;
 
    case nir_op_fquantize2f16: {
-      bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
+      bi_instr *f16 =
+         bi_f32_to_f16_to(b, bi_half(bi_temp(b->shader), false), s0);
 
       if (b->shader->arch < 9) {
          /* Bifrost has pseudo-ftz on conversions, that is lowered to an ftz
@@ -2961,11 +3000,11 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
       } else {
          /* Valhall doesn't have clauses, and uses a separate flush
           * instruction */
-         f16 = bi_flush_to(b, 16, bi_temp(b->shader), f16->dest[0]);
+         f16 = bi_flush_to(b, 16, bi_half(bi_temp(b->shader), false), f16->dest[0]);
          f16->ftz = true;
       }
 
-      bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
+      bi_instr *f32 = bi_f16_to_f32_to(b, dst, f16->dest[0]);
 
       if (b->shader->arch < 9)
         f32->ftz = true;
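Taken together, the v9+ lowering of fquantize2f16 is now convert, flush, convert back, with denormal flushing done by the dedicated FLUSH instruction rather than by clause-level ftz as on Bifrost. Schematically (illustrative mnemonics; the real encodings come from the builders above):

   FADD.f32   t.h0, s0, #0x0   // F32 -> F16 via the .h0 destination swizzle
   FLUSH.f16  u.h0, t.h0       // flush F16 denormals to zero (f16->ftz)
   F16_TO_F32 dst, u.h0        // widen back to F32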
@@ -4797,6 +4836,8 @@ bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
 static uint8_t
 bi_vectorize_filter(const nir_instr *instr, const void *data)
 {
+   unsigned gpu_id = *((unsigned *)data);
+
    /* Defaults work for everything else */
    if (instr->type != nir_instr_type_alu)
       return 0;
@@ -4817,6 +4858,14 @@ bi_vectorize_filter(const nir_instr *instr, const void *data)
    case nir_op_extract_i16:
    case nir_op_insert_u16:
       return 1;
+   /* On v11+, we lost all packed F16 conversions */
+   case nir_op_f2f16:
+   case nir_op_f2f16_rtz:
+   case nir_op_f2f16_rtne:
+      if (pan_arch(gpu_id) >= 11)
+         return 1;
+
+      break;
    default:
       break;
    }
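The filter's return value is the width nir_opt_vectorize may build, so returning 1 keeps the f2f16 family scalar on v11+, which is what lets bi_emit_alu assert comps == 1 there. The effect is easiest to see at the NIR level (schematic, not actual nir_print output):

   vec1 16 ssa_2 = f2f16 ssa_0.x   // stays scalar on v11+
   vec1 16 ssa_3 = f2f16 ssa_0.y
   // rather than: vec2 16 ssa_2 = f2f16 ssa_0.xy  (needs V2F32_TO_V2F16)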
@@ -5041,7 +5090,7 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
       NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
 
       NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
-      NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
+      NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id);
       NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
 
       /* Prepass to simplify instruction selection */