nak,nir: Generalize nak_nir_split_64bit_conversions and move it to NIR
This pass was originally based on a similar pass from Intel but it's
grown support for some fancy stuff like fp64 -> fp16 conversion
splitting with proper rounding.

Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Reviewed-by: Benjamin Lee <benjamin.lee@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34126>
commit a3935c7aa2
parent 2d75e7dced

6 changed files with 63 additions and 20 deletions
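For a concrete picture of the split described in the commit message: a single fp64 -> fp16 conversion becomes two narrowing conversions plus a sticky-bit fixup so the two roundings behave like one. Below is a minimal sketch of that shape, built with the same nir_builder helpers the pass uses in the diff that follows. It is an illustration only: emit_split_f2f16 is a hypothetical helper name, not something this commit adds, and default (RTNE/undefined) rounding is assumed.

#include "nir.h"
#include "nir_builder.h"

/* Hypothetical sketch: hand-emit the split form of an RTNE fp64 -> fp16
 * conversion, the same shape nir_split_conversions produces when the
 * callback asks for a 32-bit intermediate.
 */
static nir_def *
emit_split_f2f16(nir_builder *b, nir_def *src_f64)
{
   /* Fold "any low dword bit set" into bit 0 of the high dword so the
    * fp64 -> fp32 step becomes exact; umin with 1 yields 0 or 1.
    */
   nir_def *lo = nir_unpack_64_2x32_split_x(b, src_f64);
   nir_def *hi = nir_unpack_64_2x32_split_y(b, src_f64);
   hi = nir_ior(b, hi, nir_umin_imm(b, lo, 1));
   nir_def *sticky = nir_pack_64_2x32_split(b, nir_imm_int(b, 0), hi);

   /* Two narrowing conversions instead of one fp64 -> fp16 */
   return nir_f2f16(b, nir_f2f32(b, sticky));
}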
src/compiler/nir/meson.build

@@ -292,6 +292,7 @@ files_libnir = files(
   'nir_serialize.h',
   'nir_shader_compiler_options.h',
   'nir_split_64bit_vec3_and_vec4.c',
+  'nir_split_conversions.c',
   'nir_split_per_member_structs.c',
   'nir_split_var_copies.c',
   'nir_split_vars.c',
src/compiler/nir/nir.h

@@ -5791,6 +5791,16 @@ bool nir_lower_bit_size(nir_shader *shader,
                         void *callback_data);
 bool nir_lower_64bit_phis(nir_shader *shader);
 
+typedef struct nir_split_conversions_options {
+   nir_lower_bit_size_callback callback;
+   void *callback_data;
+   /* True if the implementation supports nir_intrinsic_convert_alu_types */
+   bool has_convert_alu_types;
+} nir_split_conversions_options;
+
+bool nir_split_conversions(nir_shader *shader,
+                           const nir_split_conversions_options *options);
+
 bool nir_split_64bit_vec3_and_vec4(nir_shader *shader);
 
 nir_lower_int64_options nir_lower_int64_op_to_options_mask(nir_op opcode);
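Note that the callback reuses the existing nir_lower_bit_size_callback contract: it returns 0 to leave a conversion untouched, or the bit size of the intermediate type to split the conversion through. The nak changes later in this commit show a typical implementation, returning 32 for any conversion that crosses the 64-bit boundary.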
src/nouveau/compiler/nak_nir_split_64bit_conversions.c → src/compiler/nir/nir_split_conversions.c

@@ -23,7 +23,7 @@
 
 /* Adapted from intel_nir_lower_conversions.c */
 
-#include "nak_private.h"
+#include "nir.h"
 #include "nir_builder.h"
 
 static nir_rounding_mode
@@ -37,8 +37,10 @@ op_rounding_mode(nir_op op)
 }
 
 static bool
-split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
+split_conversion_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
 {
+   const nir_split_conversions_options *opts = _data;
+
    if (instr->type != nir_instr_type_alu)
       return false;
 
@@ -47,21 +49,25 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
    if (!nir_op_infos[alu->op].is_conversion)
       return false;
 
+   unsigned tmp_bit_size = opts->callback(instr, opts->callback_data);
+   if (tmp_bit_size == 0)
+      return false;
+
    unsigned src_bit_size = nir_src_bit_size(alu->src[0].src);
+   unsigned dst_bit_size = alu->def.bit_size;
+   if (src_bit_size < dst_bit_size)
+      assert(src_bit_size < tmp_bit_size && tmp_bit_size < dst_bit_size);
+   else
+      assert(dst_bit_size < tmp_bit_size && tmp_bit_size < src_bit_size);
+
    nir_alu_type src_type = nir_op_infos[alu->op].input_types[0];
    nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size);
 
-   unsigned dst_bit_size = alu->def.bit_size;
    nir_alu_type dst_full_type = nir_op_infos[alu->op].output_type;
    assert(nir_alu_type_get_type_size(dst_full_type) == dst_bit_size);
    nir_alu_type dst_type = nir_alu_type_get_base_type(dst_full_type);
    const nir_rounding_mode rounding_mode = op_rounding_mode(alu->op);
 
-   /* We can't cross the 64-bit boundary in one conversion */
-   if ((src_bit_size <= 32 && dst_bit_size <= 32) ||
-       (src_bit_size >= 32 && dst_bit_size >= 32))
-      return false;
-
    nir_alu_type tmp_type;
    if ((src_full_type == nir_type_float16 && dst_bit_size == 64) ||
        (src_bit_size == 64 && dst_full_type == nir_type_float16)) {
@@ -69,6 +75,7 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
        * 32-bit float type so we don't lose range when we convert to/from
        * a 64-bit integer.
        */
+      assert(tmp_bit_size == 32);
       tmp_type = nir_type_float32;
    } else {
       /* For fp64 to integer conversions, using an integer intermediate type
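A quick aside on why the intermediate must be float32 here: fp32 can represent the magnitude of any 64-bit integer (inexactly), so going through it only costs precision that fp16 would discard anyway, whereas a 32-bit integer intermediate would mangle every value outside the int32/uint32 range.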
@@ -83,7 +90,7 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
        * For all other conversions, the conversion from int to int is either
        * lossless or just as lossy as the final conversion.
        */
-      tmp_type = dst_type | 32;
+      tmp_type = dst_type | tmp_bit_size;
    }
 
    b->cursor = nir_before_instr(&alu->instr);
@@ -95,7 +102,8 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
        */
       assert(tmp_type == nir_type_float32);
       if (rounding_mode == nir_rounding_mode_rtne ||
-          rounding_mode == nir_rounding_mode_undef) {
+          rounding_mode == nir_rounding_mode_undef ||
+          !opts->has_convert_alu_types) {
          nir_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
          nir_def *src_hi = nir_unpack_64_2x32_split_y(b, src);
 
@@ -148,10 +156,14 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
           * sufficiently negative exponent that it will flush to zero when
           * converted to fp16, regardless of what we do here.
           *
+          * This same trick works for all the rounding modes. Even though the
+          * actual rounding logic is a bit different, they all treat the F and
+          * D bits together based on "all F and D bits are zero" or not.
+          *
           * There are many operations we could choose for combining the low
           * dword bits for ORing into the high dword. We choose umin because
-          * it nicely translates to a single fixed-latency instruction on
-          * everything except Volta.
+          * it nicely translates to a single fixed-latency instruction on a
+          * lot of hardware.
           */
          src_hi = nir_ior(b, src_hi, nir_umin_imm(b, src_lo, 1));
          src_lo = nir_imm_int(b, 0);
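To see why collapsing the whole low dword into one bit is sound (an illustrative breakdown, assuming a value in fp16's normal range, not part of the commit): fp16 keeps only the top 10 of the 52 fp64 mantissa bits, bit 41 is the round bit, and bits 40..0 matter only as a collective "anything set?" sticky flag.

   fp64 mantissa:  51........42 | 41 | 40......32 | 31.........0
                   kept by fp16 |  R | hi dword,  | low dword,
                                |    | sticky     | sticky

An fp64 -> fp32 conversion keeps mantissa bits 51..29, so bits 40..32 pass through untouched, but bits 28..0 would be rounded away, which is where double rounding creeps in. ORing umin(src_lo, 1) into bit 32 and zeroing the low dword preserves exactly the "all sticky bits zero or not" information while making the fp64 -> fp32 step exact; the single real rounding then happens in the fp32 -> fp16 step.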
@@ -160,7 +172,8 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
       } else {
          /* For round-up, round-down, and round-towards-zero, the rounding
           * accumulates properly as long as we use the same rounding mode for
-          * both operations.
+          * both operations. This is more efficient if the back-end supports
+          * nir_intrinsic_convert_alu_types.
           */
          tmp = nir_convert_alu_types(b, 32, src,
                                      .src_type = nir_type_float64,
@@ -183,9 +196,10 @@ split_64bit_conversion(nir_builder *b, nir_instr *instr, UNUSED void *_data)
 }
 
 bool
-nak_nir_split_64bit_conversions(nir_shader *nir)
+nir_split_conversions(nir_shader *shader,
+                      const nir_split_conversions_options *options)
 {
-   return nir_shader_instructions_pass(nir, split_64bit_conversion,
+   return nir_shader_instructions_pass(shader, split_conversion_instr,
                                        nir_metadata_control_flow,
-                                       NULL);
+                                       (void *)options);
 }
src/nouveau/compiler/meson.build

@@ -34,7 +34,6 @@ libnak_c_files = files(
   'nak_nir_lower_vtg_io.c',
   'nak_nir_mark_lcssa_invariants.c',
   'nak_nir_rematerialize_load_const.c',
-  'nak_nir_split_64bit_conversions.c',
 )
 
 _libacorn_rs = static_library(
src/nouveau/compiler/nak_nir.c

@@ -915,6 +915,21 @@ atomic_supported(const nir_instr *instr, const void *data)
           intr->def.bit_size == 64);
 }
 
+static unsigned
+split_conversions_cb(const nir_instr *instr, void *data)
+{
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+   unsigned src_bit_size = nir_src_bit_size(alu->src[0].src);
+   unsigned dst_bit_size = alu->def.bit_size;
+
+   /* We can't cross the 64-bit boundary in one conversion */
+   if ((src_bit_size <= 32 && dst_bit_size <= 32) ||
+       (src_bit_size >= 32 && dst_bit_size >= 32))
+      return 0;
+
+   return 32;
+}
+
 void
 nak_postprocess_nir(nir_shader *nir,
                     const struct nak_compiler *nak,
@@ -1059,8 +1074,13 @@
       }
    } while (progress);
 
-   if (nak->sm < 70)
-      OPT(nir, nak_nir_split_64bit_conversions);
+   if (nak->sm < 70) {
+      const nir_split_conversions_options split_conv_opts = {
+         .callback = split_conversions_cb,
+         .has_convert_alu_types = true,
+      };
+      OPT(nir, nir_split_conversions, &split_conv_opts);
+   }
 
    /* Re-materialize load_const instructions in the blocks that use them.
    * This is both a register pressure optimization and a ensures correctness
src/nouveau/compiler/nak_private.h

@@ -252,7 +252,6 @@ enum nak_fs_out {
 
 bool nak_nir_rematerialize_load_const(nir_shader *nir);
 bool nak_nir_mark_lcssa_invariants(nir_shader *nir);
-bool nak_nir_split_64bit_conversions(nir_shader *nir);
 bool nak_nir_lower_non_uniform_ldcx(nir_shader *nir);
 bool nak_nir_add_barriers(nir_shader *nir, const struct nak_compiler *nak);
 bool nak_nir_lower_cf(nir_shader *nir);