nir: add a utility computing post-dominance of SSA uses

The massive comment explains it. Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26819>
2025-12-22 11:20:11 +01:00 · 2023-07-10 03:01:21 -04:00 · 2023-07-10 03:01:21 -04:00 · ba54099dce
commit ba54099dce
parent 9d76ba37e8
3 changed files with 400 additions and 0 deletions
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@ -300,6 +300,7 @@ files_libnir = files(
  'nir_sweep.c',
  'nir_to_lcssa.c',
  'nir_trivialize_registers.c',
+  'nir_use_dominance.c',
  'nir_validate.c',
  'nir_vla.h',
  'nir_worklist.c',
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@ -6698,6 +6698,22 @@ nir_store_reg_for_def(const nir_def *def)
   return intr;
 }

+struct nir_use_dominance_state;
+
+struct nir_use_dominance_state *
+nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance);
+
+nir_instr *
+nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
+                                nir_instr *instr);
+nir_instr *nir_use_dominance_lca(struct nir_use_dominance_state *state,
+                                 nir_instr *i1, nir_instr *i2);
+bool nir_instr_dominates_use(struct nir_use_dominance_state *state,
+                             nir_instr *parent, nir_instr *child);
+void nir_print_use_dominators(struct nir_use_dominance_state *state,
+                              nir_instr **instructions,
+                              unsigned num_instructions);
+
 #include "nir_inline_helpers.h"

 #ifdef __cplusplus
--- a/src/compiler/nir/nir_use_dominance.c
+++ b/src/compiler/nir/nir_use_dominance.c
@ -0,0 +1,383 @@
+/*
+ * Copyright 2014 Intel Corporation
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/* This implements dominance and post-dominance of the SSA use graph where
+ * instructions are vertices and SSA uses are edges (i.e. edges go from
+ * each instruction to all its uses). CF nodes are ignored and irrelevant.
+ * It's different from nir_dominance.c, but the algorithm is the same, which
+ * is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
+ *
+ * Definitions:
+ * - Instruction A is post-dominated by instruction B if the result of
+ *   instruction A and following intermediate results using the result of
+ *   instruction A only affect the result of instruction B. Consequently,
+ *   if instruction B was removed, instruction A would become dead including
+ *   all instructions computing the intermediate results.
+ *   Example: A(load) -> ... -> B(ALU)
+ *   Note: This is the foundation of inter-shader code motion from later
+ *   shaders to earlier shaders.
+ * - Instruction B is dominated by instruction A if all use paths from
+ *   all loads to instruction B must go through instruction A.
+ *   Note: Unlike post-dominance, dominance is unusable as-is because
+ *   the immediate dominator typically doesn't exist if there are non-unary
+ *   opcodes (i.e. branches of an expression tree following source operands
+ *   don't usually converge to a single instruction unless all instructions
+ *   are unary). The solution is to ignore loads like load_const to allow
+ *   non-unary opcodes, which would be the foundation of inter-shader code
+ *   motion from earlier shaders to later shaders, such as 2 output stores
+ *   having only 1 ALU instruction as their only source at the beginning,
+ *   ignoring constant and uniform operands along the way.
+ *
+ * Interesting cases implied by this (post-)dominator tree:
+ * - load_const, loads without src operands, and undef are not dominated by
+ *   anything because they don't have any src operands.
+ * - No instruction post-dominates store intrinsics (and all other intrinsics
+ *   without a destination) and nir_if nodes (they use a value but don't
+ *   produce any).
+ *
+ * Typical application:
+ * - The immediate post-dominator query returns the solution to the problem of
+ *   how much code we can move into the previous shader or preamble without
+ *   increasing the number of inputs. Example of an SSA-use graph and
+ *   the possible result that a user of this utility can produce:
+ *
+ *          input0 input1             input0 input1
+ *              \   / \                  |      \
+ *    constant   alu  ...    ------>     |     ...
+ *           \   /
+ *            alu
+ * (immediate post-dominator of input0)
+ *
+ * Examples of possible applications:
+ * - Moving load_input+ALU to the previous shader: An immediate post-dominator
+ *   of load_input and all instructions between load_input and the immediate
+ *   post-dominator are a candidate for being moved into the previous shader
+ *   and we only need to check if the post-dominator is movable. Repeat
+ *   the immediate post-dominator query on the accepted post-dominator and see
+ *   if that is also movable. Repeat that until you find the farthest post-
+ *   dominator that is movable.
+ * - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
+ *   post-dominator of load_uniform is a candidate for being moved into
+ *   the preamble shader or the CPU. Repeat the immediate post-dominator query
+ *   until you find the farthest post-dominator that is movable.
+ * - Replacing a value used to compute 2 shader outputs by only 1 output, and
+ *   moving the computation into the next shader:
+ *   The Lowest Common Ancestor of 2 output stores within the dominator tree
+ *   is a candidate for the new replacement output. Any loads that are
+ *   trivially movable such as load_const should be ignored by this utility,
+ *   otherwise the Lowest Common Ancestor wouldn't exist.
+ *
+ * Queries:
+ * - get the immediate dominator of an instruction
+ * - get the Lowest Common Ancestor of 2 instructions
+ * - whether one instruction dominates another
+ *
+ * Implemenation details:
+ * - Since some instructions are not dominated by anything, a dummy root is
+ *   added into the graph that dominates such instructions, which is required
+ *   by the algorithm.
+ *
+ * TODO: only post-dominance implemented, not dominance
+ */
+
+#include "nir.h"
+
+struct nir_use_dom_node {
+   nir_instr *instr;
+   uint32_t index;
+
+   /* The index of this node's immediate dominator in the dominator tree.
+    * The dummy root points it to itself. -1 == unset.
+    */
+   int32_t imm_dom;
+};
+
+struct nir_use_dominance_state {
+   nir_function_impl *impl;
+   struct nir_use_dom_node *dom_nodes;
+   unsigned num_dom_nodes;
+};
+
+static struct nir_use_dom_node *
+get_node(struct nir_use_dominance_state *state, nir_instr *instr)
+{
+   return &state->dom_nodes[instr->index];
+}
+
+static struct nir_use_dom_node *
+get_imm_dom(struct nir_use_dominance_state *state,
+            struct nir_use_dom_node *node)
+{
+   assert(node->imm_dom != -1);
+   return &state->dom_nodes[node->imm_dom];
+}
+
+static bool
+init_instr(struct nir_use_dominance_state *state, nir_instr *instr,
+           unsigned *index)
+{
+   assert(*index < state->num_dom_nodes);
+   struct nir_use_dom_node *node = &state->dom_nodes[*index];
+
+   if (*index == 0) {
+      /* dummy root */
+      node->imm_dom = 0;
+   } else {
+      node->imm_dom = -1;
+      node->instr = instr;
+      instr->index = node->index = *index;
+   }
+   (*index)++;
+
+   return true;
+}
+
+static struct nir_use_dom_node *
+intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1,
+          struct nir_use_dom_node *i2)
+{
+   while (i1 != i2) {
+      /* Note, the comparisons here are the opposite of what the paper says
+       * because we index instrs from beginning -> end (i.e. reverse
+       * post-order) instead of post-order like they assume.
+       */
+      while (i1->index > i2->index)
+         i1 = get_imm_dom(state, i1);
+      while (i2->index > i1->index)
+         i2 = get_imm_dom(state, i2);
+   }
+
+   return i1;
+}
+
+static void
+update_imm_dom(struct nir_use_dominance_state *state,
+               struct nir_use_dom_node *pred,
+               struct nir_use_dom_node **new_idom)
+{
+   if (pred->imm_dom != -1) {
+      if (*new_idom)
+         *new_idom = intersect(state, pred, *new_idom);
+      else
+         *new_idom = pred;
+   }
+}
+
+static bool
+calc_dominance(struct nir_use_dominance_state *state,
+               struct nir_use_dom_node *node, bool post_dominance)
+{
+   struct nir_use_dom_node *new_idom = NULL;
+
+   if (post_dominance) {
+      nir_def *def = nir_instr_def(node->instr);
+      bool has_use = false;
+
+      if (def) {
+         nir_foreach_use_including_if(src, def) {
+            has_use = true;
+
+            /* Ifs are treated like stores because they don't produce
+             * a value. dom_nodes[0] is the dummy root.
+             */
+            if (nir_src_is_if(src)) {
+               update_imm_dom(state, &state->dom_nodes[0], &new_idom);
+               /* Short-cut because we can't come back from the root node. */
+               break;
+            } else {
+               update_imm_dom(state,
+                              get_node(state, nir_src_parent_instr(src)),
+                              &new_idom);
+            }
+         }
+      }
+
+      /* No destination (e.g. stores, atomics with an unused result, discard,
+       * dead instructions). dom_nodes[0] is the dummy root.
+       */
+      if (!has_use)
+         update_imm_dom(state, &state->dom_nodes[0], &new_idom);
+   } else {
+      unreachable("TODO: only post-dominance implemented, not dominance");
+   }
+
+   if (new_idom && node->imm_dom != new_idom->index) {
+      node->imm_dom = new_idom->index;
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Calculate dominance or post-dominance of the SSA use graph.
+ * The returned state must not be freed while dominance queries are being used.
+ * nir_free_use_dominance_state() frees the state.
+ *
+ * It clobbers nir_instr::index, which can't be changed while dominance queries
+ * are being used.
+ *
+ * \param impl             NIR function
+ * \param post_dominance   Whether to compute post-dominance or dominance.
+ */
+struct nir_use_dominance_state *
+nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
+{
+   struct nir_use_dominance_state *state =
+      rzalloc(NULL, struct nir_use_dominance_state);
+   if (!state)
+      return NULL;
+
+   unsigned num_dom_nodes = 1; /* including the dummy root */
+   nir_foreach_block(block, impl) {
+      num_dom_nodes += exec_list_length(&block->instr_list);
+   }
+
+   state->impl = impl;
+   state->num_dom_nodes = num_dom_nodes;
+   state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
+                                    num_dom_nodes);
+   if (!state->dom_nodes) {
+      ralloc_free(state);
+      return NULL;
+   }
+
+   unsigned index = 0;
+
+   /* We need a dummy root node because there are instructions such as
+    * load_const that aren't dominated by anything. If we are calculating
+    * post-dominance, intrinsics without a destination aren't post-dominated
+    * by anything. However, the algorithm requires a common (post-)dominator.
+    */
+   init_instr(state, NULL, &index);
+
+   /* Post-dominance is identical to dominance, but instructions are added
+    * in the opposite order.
+    */
+   if (post_dominance) {
+      nir_foreach_block_reverse(block, impl) {
+         nir_foreach_instr_reverse(instr, block) {
+            init_instr(state, instr, &index);
+         }
+      }
+   } else {
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            init_instr(state, instr, &index);
+         }
+      }
+   }
+
+   bool progress = true;
+   while (progress) {
+      progress = false;
+
+      /* Skip the dummy root (iterate from 1). */
+      for (unsigned i = 1; i < num_dom_nodes; i++) {
+         progress |= calc_dominance(state, &state->dom_nodes[i],
+                                    post_dominance);
+      }
+   }
+
+   return state;
+}
+
+nir_instr *
+nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
+                                nir_instr *instr)
+{
+   struct nir_use_dom_node *node = get_node(state, instr);
+
+   return get_imm_dom(state, node)->instr;
+}
+
+/**
+ * Computes the least common ancestor of two instructions.
+ */
+nir_instr *
+nir_use_dominance_lca(struct nir_use_dominance_state *state,
+                      nir_instr *i1, nir_instr *i2)
+{
+   assert(i1 && i2);
+   struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
+                                            get_node(state, i2));
+   assert(lca);
+   /* Might be NULL in case of the dummy root. */
+   return lca->instr;
+}
+
+/**
+ * Returns true if the parent dominates the child in the SSA use graph
+ * described at the beginning.
+ */
+bool
+nir_instr_dominates_use(struct nir_use_dominance_state *state,
+                        nir_instr *parent_instr, nir_instr *child_instr)
+{
+   struct nir_use_dom_node *parent = get_node(state, parent_instr);
+   struct nir_use_dom_node *child = get_node(state, child_instr);
+
+   while (parent->index < child->index)
+      child = get_imm_dom(state, child);
+
+   return parent == child;
+}
+
+static void
+print_instr(struct nir_use_dom_node *node)
+{
+   if (!node)
+      printf("NULL - bug");
+   else if (node->index == 0)
+      printf("dummy_root");
+   else
+      nir_print_instr(node->instr, stdout);
+}
+
+void
+nir_print_use_dominators(struct nir_use_dominance_state *state,
+                         nir_instr **instructions, unsigned num_instructions)
+{
+   for (unsigned i = 0; i < num_instructions; i++) {
+      printf("Input idom(\"");
+      nir_print_instr(instructions[i], stdout);
+      printf("\") = \"");
+      print_instr(get_imm_dom(state, get_node(state, instructions[i])));
+      printf("\"\n");
+   }
+   puts("");
+
+   nir_foreach_block(block, state->impl) {
+      nir_foreach_instr(instr, block) {
+         printf("idom(\"");
+         nir_print_instr(instr, stdout);
+         printf("\") = \"");
+         print_instr(get_imm_dom(state, get_node(state, instr)));
+         printf("\"\n");
+      }
+   }
+   puts("");
+
+   for (unsigned i = 0; i < num_instructions; i++) {
+      for (unsigned j = i + 1; j < num_instructions; j++) {
+         printf("LCA input 1: ");
+         nir_print_instr(instructions[i], stdout);
+         printf("\nLCA input 2: ");
+         nir_print_instr(instructions[j], stdout);
+         puts("");
+         nir_instr *lca =
+            nir_use_dominance_lca(state, instructions[i], instructions[j]);
+
+         if (lca) {
+            printf("2 inputs have a common post-dominator: ");
+            nir_print_instr(lca, stdout);
+            printf("\n");
+         }
+         puts("");
+      }
+   }
+}