nir/lower_vec_to_movs: Coalesce movs on-the-fly when possible

The old pass blindly inserted a bunch of moves into the shader with no concern for whether or not they were really needed. This adds code to try and coalesce into the destination of the instruction providing the value. Shader-db results for vec4 shaders on Haswell: total instructions in shared programs: 1754420 -> 1747753 (-0.38%) instructions in affected programs: 231230 -> 224563 (-2.88%) helped: 1017 HURT: 2 This approach is heavily based on a different patch by Eduardo Lima Mitev <elima@igalia.com>. Eduardo's patch did this in a separate pass as opposed to integrating it into nir_lower_vec_to_movs. Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
2026-05-07 00:38:48 +02:00 · 2015-09-09 14:40:06 -07:00 · 2015-09-09 14:40:06 -07:00 · 2458ea95c5
commit 2458ea95c5
parent 2b2f1f16a0
1 changed files with 85 additions and 0 deletions
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@ -79,6 +79,88 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
   return mov->dest.write_mask;
 }

+/* Attempts to coalesce the "move" from the given source of the vec to the
+ * destination of the instruction generating the value. If, for whatever
+ * reason, we cannot coalesce the move, it does nothing and returns 0.  We
+ * can then call insert_mov as normal.
+ *
+ * vec is the vecN instruction being lowered and start_idx is the index of
+ * the vec source to start from.  When coalescing succeeds, the ALU
+ * instruction that produced that source is re-swizzled and rewritten to
+ * write directly into the vec's destination, the matching vec sources are
+ * cleared, and the mask of vec destination channels now written by that
+ * ALU instruction is returned.
+ */
+static unsigned
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
+{
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);

+   /* We will only even try if the source is SSA */
+   if (!vec->src[start_idx].src.is_ssa)
+      return 0;

+   assert(vec->src[start_idx].src.ssa);

+   /* If we are going to do a reswizzle, then the vecN operation must be the
+    * only use of the source value.  We also can't have any source modifiers.
+    */
+   nir_foreach_use(vec->src[start_idx].src.ssa, src) {
+      if (src->parent_instr != &vec->instr)
+         return 0;

+      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
+      if (alu_src->abs || alu_src->negate)
+         return 0;
+   }

+   /* Bail as well if the value has any entries in its if_uses list. */
+   if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
+      return 0;

+   /* Only a value produced by an ALU instruction can be coalesced. */
+   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return 0;

+   nir_alu_instr *src_alu =
+      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);

+   /* We only care about being able to re-swizzle the instruction if it is
+    * something that we can reswizzle.  It must be per-component.
+    */
+   if (nir_op_infos[src_alu->op].output_size != 0)
+      return 0;

+   /* If we are going to reswizzle the instruction, we can't have any
+    * non-per-component sources either.
+    */
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+         return 0;

+   /* Stash off all of the ALU instruction's swizzles.  The loop below
+    * overwrites src_alu's swizzles in place, so it has to read the original
+    * values from this copy.
+    */
+   uint8_t swizzles[4][4];
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      for (unsigned i = 0; i < 4; i++)
+         swizzles[j][i] = src_alu->src[j].swizzle[i];

+   /* Walk the vec channels from start_idx onward and collect every written
+    * channel whose source is the SSA value produced by src_alu.
+    */
+   unsigned write_mask = 0;
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(vec->dest.write_mask & (1 << i)))
+         continue;

+      if (!vec->src[i].src.is_ssa ||
+          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
+         continue;

+      /* At this point, the given vec source matches up with the ALU
+       * instruction so we can re-swizzle that component to match.
+       */
+      write_mask |= 1 << i;
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];

+      /* Clear the no longer needed vec source */
+      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
+   }

+   /* Make the ALU instruction write straight into the vec's destination,
+    * limited to the channels collected above.
+    */
+   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
+   src_alu->dest.write_mask = write_mask;

+   return write_mask;
+}
+
 static bool
 lower_vec_to_movs_block(nir_block *block, void *void_impl)
 {
@ -132,6 +214,9 @@ lower_vec_to_movs_block(nir_block *block, void *void_impl)
         if (!(vec->dest.write_mask & (1 << i)))
            continue;

+         if (!(finished_write_mask & (1 << i)))
+            finished_write_mask |= try_coalesce(vec, i, shader);
+
         if (!(finished_write_mask & (1 << i)))
            finished_write_mask |= insert_mov(vec, i, shader);
      }