nir: Add lower_vec_to_regs pass

This is a variant of nir_lower_vec_to_movs that produces register
intrinsics (store_reg with write masks) instead of masked moves.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23089>

commit 61010e5255 (parent aea8a70200)
3 changed files with 270 additions and 0 deletions
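For a sense of what the pass produces, here is an illustrative before/after
(made-up, simplified NIR; not output captured from this commit). A vec4
gathering channels from two different values:

   ssa_2 = vec4 ssa_0.x, ssa_0.x, ssa_1.y, ssa_1.z

is lowered to a register declaration plus masked partial stores, roughly:

   reg_0 = decl_reg vec4
   store_reg reg_0, ssa_0.xxxx, write_mask=0x3   (channels x, y)
   store_reg reg_0, ssa_1.xxyz, write_mask=0xc   (channels z, w)

with uses of ssa_2 rewritten to load_reg reg_0.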
src/compiler/nir/meson.build

@@ -221,6 +221,7 @@ files_libnir = files(
   'nir_lower_vars_to_ssa.c',
   'nir_lower_var_copies.c',
   'nir_lower_vec_to_movs.c',
+  'nir_lower_vec_to_regs.c',
   'nir_lower_vec3_to_vec4.c',
   'nir_lower_viewport_transform.c',
   'nir_lower_wpos_center.c',
src/compiler/nir/nir.h

@@ -5351,6 +5351,8 @@ bool nir_zero_initialize_shared_memory(nir_shader *shader,
 bool nir_move_vec_src_uses_to_dest(nir_shader *shader);
 bool nir_lower_vec_to_movs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
                            const void *_data);
+bool nir_lower_vec_to_regs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
+                           const void *_data);
 void nir_lower_alpha_test(nir_shader *shader, enum compare_func func,
                           bool alpha_to_one,
                           const gl_state_index16 *alpha_ref_state_tokens);
src/compiler/nir/nir_lower_vec_to_regs.c  (new file, 267 lines)

@@ -0,0 +1,267 @@
/*
 * Copyright 2023 Valve Corporation
 * Copyright 2014 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "nir.h"
#include "nir_builder.h"

/*
 * This file implements a simple pass that lowers vecN instructions to a
 * series of register stores with partial write masks.
 */

struct data {
   nir_instr_writemask_filter_cb cb;
   const void *data;
};

/**
 * For a given starting writemask channel and corresponding source index in
 * the vec instruction, insert a store_reg to the vector register with a
 * writemask containing all the channels that get read from the same src reg.
 *
 * Returns the writemask of the store, so the parent loop calling this knows
 * which ones have been processed.
 */
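/* A hypothetical worked example (not in the original commit): given
 *    ssa_3 = vec4 ssa_1.x, ssa_1.y, ssa_2.x, ssa_1.z
 * a call with start_idx == 0 finds that channels 0, 1 and 3 all read ssa_1,
 * so it emits a store_reg of ssa_1 swizzled to .xyxz (channel 2 is a
 * don't-care) with write_mask 0b1011 and returns 0b1011. Channel 2 is then
 * handled by a later call with start_idx == 2.
 */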
static unsigned
insert_store(nir_builder *b, nir_ssa_def *reg, nir_alu_instr *vec,
             unsigned start_idx)
{
   assert(start_idx < nir_op_infos[vec->op].num_inputs);
   assert(vec->src[start_idx].src.is_ssa);
   nir_ssa_def *src = vec->src[start_idx].src.ssa;

   unsigned num_components = nir_dest_num_components(vec->dest.dest);
   assert(num_components == nir_op_infos[vec->op].num_inputs);
   unsigned write_mask = 0;
   unsigned swiz[NIR_MAX_VEC_COMPONENTS] = {0};

   for (unsigned i = start_idx; i < num_components; i++) {
      if (vec->src[i].src.ssa == src) {
         write_mask |= BITFIELD_BIT(i);
         swiz[i] = vec->src[i].swizzle[0];
      }
   }

   /* No sense storing from undef, just return the write mask */
   if (src->parent_instr->type == nir_instr_type_ssa_undef)
      return write_mask;

   b->cursor = nir_before_instr(&vec->instr);
   nir_build_store_reg(b, nir_swizzle(b, src, swiz, num_components), reg,
                       .write_mask = write_mask);
   return write_mask;
}

static bool
has_replicated_dest(nir_alu_instr *alu)
{
   return alu->op == nir_op_fdot2_replicated ||
          alu->op == nir_op_fdot3_replicated ||
          alu->op == nir_op_fdot4_replicated ||
          alu->op == nir_op_fdph_replicated;
}
/* Attempts to coalesce the "move" from the given source of the vec to the
 * destination of the instruction generating the value. If, for whatever
 * reason, we cannot coalesce the move, it does nothing and returns 0. We
 * can then call insert_store as normal.
 */
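/* For example (illustrative IR, not from the original commit): if the only
 * use of
 *    ssa_2 = fadd ssa_0.zw, ssa_1.zw
 * is channels 0-1 of a vec4, the fadd's destination can be widened and its
 * source swizzles rewritten so the result lands directly in those channels
 * and is stored straight to the register, skipping the intermediate copy.
 */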
static unsigned
try_coalesce(nir_builder *b, nir_ssa_def *reg, nir_alu_instr *vec,
             unsigned start_idx, struct data *data)
{
   assert(start_idx < nir_op_infos[vec->op].num_inputs);
   assert(vec->src[start_idx].src.is_ssa);

   /* If we are going to do a reswizzle, then the vecN operation must be the
    * only use of the source value.
    */
   nir_foreach_use_including_if(src, vec->src[start_idx].src.ssa) {
      if (src->is_if)
         return 0;

      if (src->parent_instr != &vec->instr)
         return 0;
   }

   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *src_alu =
      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);

   if (has_replicated_dest(src_alu)) {
      /* The fdot instruction is special: It replicates its result to all
       * components. This means that we can always rewrite its destination
       * and we don't need to swizzle anything.
       */
   } else {
      /* We only care about being able to re-swizzle the instruction if it is
       * something that we can reswizzle. It must be per-component. The one
       * exception to this is the fdotN instructions which implicitly splat
       * their result out to all channels.
       */
      if (nir_op_infos[src_alu->op].output_size != 0)
         return 0;

      /* If we are going to reswizzle the instruction, we can't have any
       * non-per-component sources either.
       */
      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
            return 0;
   }

   /* Only vecN instructions have more than 4 sources and those are disallowed
    * by the above check for non-per-component sources. This assumption saves
    * us a bit of stack memory.
    */
   assert(nir_op_infos[src_alu->op].num_inputs <= 4);

   /* Stash off all of the ALU instruction's swizzles. */
   uint8_t swizzles[4][NIR_MAX_VEC_COMPONENTS];
   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
      for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
         swizzles[j][i] = src_alu->src[j].swizzle[i];

   unsigned dest_components = nir_dest_num_components(vec->dest.dest);
   assert(dest_components == nir_op_infos[vec->op].num_inputs);

   /* Generate the final write mask */
   nir_component_mask_t write_mask = 0;
   for (unsigned i = start_idx; i < dest_components; i++) {
      if (vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
         continue;

      write_mask |= BITFIELD_BIT(i);
   }

   /* If the instruction would be vectorized but the backend
    * doesn't support vectorizing this op, abort.
    */
   if (data->cb && !data->cb(&src_alu->instr, write_mask, data->data))
      return 0;

   for (unsigned i = 0; i < dest_components; i++) {
      bool valid = write_mask & BITFIELD_BIT(i);

      /* At this point, the given vec source matches up with the ALU
       * instruction so we can re-swizzle that component to match.
       */
      if (has_replicated_dest(src_alu)) {
         /* Since the destination is a single replicated value, we don't need
          * to do any reswizzling
          */
      } else {
         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++) {
            /* For channels we're extending out of nowhere, use a benign
             * swizzle so we don't read invalid components and trip
             * nir_validate.
             */
            unsigned c = valid ? vec->src[i].swizzle[0] : 0;

            src_alu->src[j].swizzle[i] = swizzles[j][c];
         }
      }

      /* Clear the no longer needed vec source */
      if (valid)
         nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
   }

   /* We've cleared the only use of the destination */
   assert(list_is_empty(&src_alu->dest.dest.ssa.uses));

   /* ... so we can replace it with the bigger destination accommodating the
    * whole vector that will be masked for the store.
    */
   unsigned bit_size = nir_dest_bit_size(vec->dest.dest);
   assert(bit_size == src_alu->dest.dest.ssa.bit_size);
   nir_ssa_dest_init(&src_alu->instr, &src_alu->dest.dest,
                     dest_components, bit_size);
   src_alu->dest.write_mask = nir_component_mask(dest_components);

   /* Then we can store that ALU result directly into the register */
   b->cursor = nir_after_instr(&src_alu->instr);
   nir_build_store_reg(b, &src_alu->dest.dest.ssa,
                       reg, .write_mask = write_mask);

   return write_mask;
}
static bool
lower(nir_builder *b, nir_instr *instr, void *data_)
{
   struct data *data = data_;
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *vec = nir_instr_as_alu(instr);
   if (vec->op == nir_op_mov || !nir_op_is_vec(vec->op))
      return false;

   assert(vec->dest.dest.is_ssa);
   unsigned num_components = nir_dest_num_components(vec->dest.dest);

   /* Special case: if all sources are the same, just swizzle instead to avoid
    * the extra copies from a register.
    */
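   /* Hypothetical illustration (not from the original commit):
    *    ssa_1 = vec3 ssa_0.z, ssa_0.z, ssa_0.x
    * reads only ssa_0, so it can become a plain swizzle ssa_0.zzx with no
    * register involved.
    */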
   bool need_reg = false;
   for (unsigned i = 1; i < num_components; ++i) {
      if (!nir_srcs_equal(vec->src[0].src, vec->src[i].src)) {
         need_reg = true;
         break;
      }
   }

   if (need_reg) {
      /* We'll replace with a register. Declare one for the purpose. */
      nir_ssa_def *reg = nir_decl_reg(b, num_components,
                                      nir_dest_bit_size(vec->dest.dest), 0);

      unsigned finished_write_mask = 0;
      for (unsigned i = 0; i < num_components; i++) {
         /* Try to coalesce the move */
         if (!(finished_write_mask & BITFIELD_BIT(i)))
            finished_write_mask |= try_coalesce(b, reg, vec, i, data);

         /* Otherwise fall back on the simple path */
         if (!(finished_write_mask & BITFIELD_BIT(i)))
            finished_write_mask |= insert_store(b, reg, vec, i);
      }

      nir_rewrite_uses_to_load_reg(b, &vec->dest.dest.ssa, reg);
   } else {
      /* Otherwise, we replace with a swizzle */
      unsigned swiz[NIR_MAX_VEC_COMPONENTS] = {0};

      for (unsigned i = 0; i < num_components; ++i) {
         assert(vec->src[i].src.is_ssa);
         swiz[i] = vec->src[i].swizzle[0];
      }

      b->cursor = nir_before_instr(instr);
      nir_ssa_def *swizzled = nir_swizzle(b, vec->src[0].src.ssa, swiz,
                                          num_components);
      nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, swizzled);
   }

   nir_instr_remove(&vec->instr);
   nir_instr_free(&vec->instr);
   return true;
}
bool
nir_lower_vec_to_regs(nir_shader *shader, nir_instr_writemask_filter_cb cb,
                      const void *_data)
{
   struct data data = {
      .cb = cb,
      .data = _data
   };

   return nir_shader_instructions_pass(shader, lower,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance, &data);
}
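A minimal sketch of a call site, assuming a hypothetical backend. The filter
callback, its policy, and example_lower_vectors are illustrative, not part of
this commit; only nir_lower_vec_to_regs and the nir_instr_writemask_filter_cb
signature come from the change itself.

#include "nir.h"
#include "util/bitscan.h"

/* Hypothetical filter: only let try_coalesce rewrite an instruction when the
 * resulting write mask is contiguous starting at component x. Returning false
 * makes the pass fall back to insert_store for those channels.
 */
static bool
contiguous_writemask_filter(const nir_instr *instr, unsigned writemask,
                            const void *data)
{
   return writemask == BITFIELD_MASK(util_bitcount(writemask));
}

static void
example_lower_vectors(nir_shader *shader)
{
   /* Passing a NULL callback instead would allow every coalescing
    * opportunity.
    */
   NIR_PASS_V(shader, nir_lower_vec_to_regs, contiguous_writemask_filter, NULL);
}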