nir: Add a faster lowest common ancestor algorithm

On a fossil from the blender 4.5.0 vulkan backend, this improves compile
times in nak by about 17%. Compile time of other shaders improves by a
more modest 1.2%.

No stat changes on shader-db.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36184>
This commit is contained in:
Mel Henning 2025-07-15 21:11:34 -04:00 committed by Marge Bot
parent 0d07b86073
commit 17876a00af
9 changed files with 226 additions and 38 deletions

View file

@ -114,6 +114,7 @@ else
'nir_deref.h', 'nir_deref.h',
'nir_divergence_analysis.c', 'nir_divergence_analysis.c',
'nir_dominance.c', 'nir_dominance.c',
'nir_dominance_lca.c',
'nir_fixup_is_exported.c', 'nir_fixup_is_exported.c',
'nir_format_convert.c', 'nir_format_convert.c',
'nir_format_convert.h', 'nir_format_convert.h',

View file

@ -692,6 +692,8 @@ nir_function_impl_create_bare(nir_shader *shader)
impl->num_blocks = 0; impl->num_blocks = 0;
impl->valid_metadata = nir_metadata_none; impl->valid_metadata = nir_metadata_none;
impl->structured = true; impl->structured = true;
range_minimum_query_table_init(&impl->dom_lca_info.table);
impl->dom_lca_info.block_from_idx = NULL;
/* create start & end blocks */ /* create start & end blocks */
nir_block *start_block = nir_block_create(shader); nir_block *start_block = nir_block_create(shader);

View file

@ -41,6 +41,7 @@
#include "util/log.h" #include "util/log.h"
#include "util/macros.h" #include "util/macros.h"
#include "util/ralloc.h" #include "util/ralloc.h"
#include "util/range_minimum_query.h"
#include "util/set.h" #include "util/set.h"
#include "util/u_math.h" #include "util/u_math.h"
#include "nir_defines.h" #include "nir_defines.h"
@ -3476,6 +3477,17 @@ typedef enum {
*/ */
nir_metadata_divergence = 0x40, nir_metadata_divergence = 0x40,
/** Indicates that block dominance lca information is valid
*
* This includes:
*
* - nir_function_impl::dom_lca_info
*
* A pass can preserve this metadata type if it preserves
* nir_metadata_dominance.
*/
nir_metadata_dominance_lca = 0x80,
/** All control flow metadata /** All control flow metadata
* *
* This includes all metadata preserved by a pass that preserves control flow * This includes all metadata preserved by a pass that preserves control flow
@ -3486,7 +3498,8 @@ typedef enum {
* This is the most common metadata set to preserve, so it has its own alias. * This is the most common metadata set to preserve, so it has its own alias.
*/ */
nir_metadata_control_flow = nir_metadata_block_index | nir_metadata_control_flow = nir_metadata_block_index |
nir_metadata_dominance, nir_metadata_dominance |
nir_metadata_dominance_lca,
/** All metadata /** All metadata
* *
@ -3524,6 +3537,12 @@ typedef struct nir_function_impl {
/* total number of basic blocks, only valid when block_index_dirty = false */ /* total number of basic blocks, only valid when block_index_dirty = false */
unsigned num_blocks; unsigned num_blocks;
/** Information used for LCA queries */
struct nir_dom_lca_info {
struct range_minimum_query_table table;
nir_block **block_from_idx;
} dom_lca_info;
/** True if this nir_function_impl uses structured control-flow /** True if this nir_function_impl uses structured control-flow
* *
* Structured nir_function_impls have different validation rules. * Structured nir_function_impls have different validation rules.
@ -4921,8 +4940,18 @@ bool nir_shader_lower_instructions(nir_shader *shader,
void nir_calc_dominance_impl(nir_function_impl *impl); void nir_calc_dominance_impl(nir_function_impl *impl);
void nir_calc_dominance(nir_shader *shader); void nir_calc_dominance(nir_shader *shader);
void nir_calc_dominance_lca_impl(nir_function_impl *impl);
/**
* Computes the lowest common ancestor of two blocks in the dominator tree.
*
* If one of the blocks is null or unreachable, the other block is returned or
* NULL if it's unreachable.
*
* Requires nir_metadata_dominance_lca
*/
nir_block *nir_dominance_lca(nir_block *b1, nir_block *b2); nir_block *nir_dominance_lca(nir_block *b1, nir_block *b2);
bool nir_block_dominates(nir_block *parent, nir_block *child); bool nir_block_dominates(nir_block *parent, nir_block *child);
bool nir_block_is_unreachable(nir_block *block); bool nir_block_is_unreachable(nir_block *block);

View file

@ -212,35 +212,6 @@ nir_calc_dominance(nir_shader *shader)
} }
} }
static nir_block *
block_return_if_reachable(nir_block *b)
{
return (b && nir_block_is_reachable(b)) ? b : NULL;
}
/**
* Computes the least common ancestor of two blocks. If one of the blocks
* is null or unreachable, the other block is returned or NULL if it's
* unreachable.
*/
nir_block *
nir_dominance_lca(nir_block *b1, nir_block *b2)
{
if (b1 == NULL || !nir_block_is_reachable(b1))
return block_return_if_reachable(b2);
if (b2 == NULL || !nir_block_is_reachable(b2))
return block_return_if_reachable(b1);
assert(nir_cf_node_get_function(&b1->cf_node) ==
nir_cf_node_get_function(&b2->cf_node));
assert(nir_cf_node_get_function(&b1->cf_node)->valid_metadata &
nir_metadata_dominance);
return intersect(b1, b2);
}
/** /**
* Returns true if parent dominates child according to the following * Returns true if parent dominates child according to the following
* definition: * definition:

View file

@ -0,0 +1,177 @@
/*
* Copyright 2025 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "nir.h"
/**
 * Find the lowest common ancestor in the dominance tree.
 *
 * We reduce the LCA problem to range minimum query using the standard Euler
 * tour method (see e.g. Bender and Farach-Colton, section 2). From there, we
 * use the simple RMQ algorithm that uses O(n log n) preprocessing time and
 * O(1) query time (Bender and Farach-Colton, section 3).
 *
 * As a slight modification, we store the block index instead of the block
 * depth. We can do this because a block at a lower tree depth always has a
 * lower block index, and we use an RMQ algorithm that doesn't rely on the
 * -1/+1 property.
 *
 * Bender, M.A., Farach-Colton, M. (2000). The LCA Problem Revisited. In:
 * Gonnet, G.H., Viola, A. (eds) LATIN 2000: Theoretical Informatics. LATIN
 * 2000. Lecture Notes in Computer Science, vol 1776. Springer, Berlin,
 * Heidelberg. https://doi.org/10.1007/10719839_9
 */
/* (Re)allocate the per-impl LCA storage to match the current block count.
 *
 * Both allocations are parented to the impl's ralloc context so they live
 * exactly as long as the impl itself.
 */
static void
realloc_info(nir_function_impl *impl)
{
   struct nir_dom_lca_info *info = &impl->dom_lca_info;
   void *mem_ctx = ralloc_parent(impl);

   /* An Euler tour of a tree with n nodes has 2n - 1 entries. */
   const uint32_t tour_len = impl->num_blocks * 2 - 1;

   range_minimum_query_table_resize(&info->table, mem_ctx, tour_len);
   info->block_from_idx = reralloc_array_size(mem_ctx, info->block_from_idx,
                                              sizeof(nir_block *),
                                              impl->num_blocks);
}
/* Returns the index of this block's representative occurrence in the Euler
 * tour (the position of its first visit).
 */
static uint32_t
dom_lca_representative(nir_block *block)
{
   /* dom_pre_index is 1-based while our tour indices are 0-based, so shift
    * down by one.
    */
   const uint32_t pre_index = block->dom_pre_index;
   return pre_index - 1;
}
/* Fill the first row of the RMQ table with an Euler tour of the dominator
 * tree, and record the block for each block index in block_from_idx.
 *
 * The tour is generated with an iterative depth-first walk over
 * dom_children, re-emitting a node each time the walk returns to it from a
 * child.  Expects realloc_info() to have sized the table and block_from_idx
 * for impl->num_blocks already.
 */
static void
generate_euler_tour(nir_function_impl *impl)
{
   uint32_t *table = impl->dom_lca_info.table.table;
   nir_block **block_from_idx = impl->dom_lca_info.block_from_idx;

   /* Single-block impls have a one-entry tour; emit it directly rather than
    * running the DFS machinery (which assumes a scratch row exists below).
    */
   if (impl->num_blocks == 1) {
      nir_block *block = nir_start_block(impl);
      table[0] = 0;
      block_from_idx[0] = block;
      return;
   }

   /* By definition, the first row of the table contains range minimum query
    * lookups for each single-element block, meaning it is just the array that
    * we will perform RMQs on. Therefore, when generating the Euler tour, we
    * store results in the first row and are free to use the rest of the table
    * as scratch memory for the depth-first search.
    *
    * The stack contains the index of the node's next child to visit.
    */
   assert(impl->dom_lca_info.table.height >= 2);
   /* The stack entries (uint32_t) are stored where a row of the table lives,
    * so a uint32_t must not be wider than a table slot's worth of memory.
    */
   STATIC_ASSERT(sizeof(uint32_t) <= sizeof(nir_block *));
   uint32_t *dfs_stack = (uint32_t *)&table[impl->dom_lca_info.table.width];

   nir_block *cur_block = nir_start_block(impl);
   /* cur_stack points at the slot for cur_block's next-child counter. */
   uint32_t *cur_stack = dfs_stack;
   bool first_visit = true;
   uint32_t i;
   for (i = 0; i < impl->dom_lca_info.table.width; i++) {
      if (cur_block == NULL) {
         /* This can happen earlier than expected if some blocks are
          * unreachable
          */
         break;
      }
      assert(cur_stack >= dfs_stack);
      /* Every visit (first or returning) emits the block into the tour. */
      table[i] = cur_block->index;
      if (first_visit) {
         /* First visit. Place it on the stack. */
         *cur_stack = 0;
         /* The first visit of a block is its representative occurrence. */
         assert(i == dom_lca_representative(cur_block));
         block_from_idx[cur_block->index] = cur_block;
      }
      if (*cur_stack < cur_block->num_dom_children) {
         /* Descend into the next unvisited dominator-tree child. */
         cur_block = cur_block->dom_children[*cur_stack];
         *cur_stack += 1;
         cur_stack += 1;
         first_visit = true;
      } else {
         /* All children done; pop back to the immediate dominator.  At the
          * root, imm_dom is NULL (assumption based on the loop's NULL check
          * above — TODO confirm against nir_calc_dominance_impl), which
          * terminates the walk.
          */
         assert(*cur_stack == cur_block->num_dom_children);
         cur_block = cur_block->imm_dom;
         cur_stack -= 1;
         first_visit = false;
      }
   }
   assert(cur_block == NULL);
   /* If unreachable blocks cut the tour short, shrink the table to the
    * number of entries actually written.
    */
   if (i != impl->dom_lca_info.table.width) {
      void *mem_ctx = ralloc_parent(impl);
      range_minimum_query_table_resize(&impl->dom_lca_info.table, mem_ctx, i);
   }
}
/* Build (or rebuild) the LCA acceleration structure for an impl.
 *
 * No-op when nir_metadata_dominance_lca is already valid.  Pulls in the
 * block-index and dominance metadata it depends on, then sizes the storage,
 * generates the Euler tour, and preprocesses the RMQ table.
 */
void
nir_calc_dominance_lca_impl(nir_function_impl *impl)
{
   if (!(impl->valid_metadata & nir_metadata_dominance_lca)) {
      nir_metadata_require(impl, nir_metadata_block_index |
                                    nir_metadata_dominance);
      realloc_info(impl);
      generate_euler_tour(impl);
      range_minimum_query_table_preprocess(&impl->dom_lca_info.table);
   }
}
/* Collapse NULL and unreachable blocks to NULL; pass reachable ones through. */
static nir_block *
block_return_if_reachable(nir_block *b)
{
   if (b == NULL)
      return NULL;
   return nir_block_is_reachable(b) ? b : NULL;
}
/* Debug check: returns true iff result is the lowest common ancestor of b1
 * and b2 in the dominator tree.
 *
 * result is a common ancestor iff it dominates both blocks; it is the
 * *lowest* one iff no dominator-tree child of result also dominates both.
 * Only used from the assert in nir_dominance_lca().
 */
static bool
is_lca(nir_block *result, nir_block *b1, nir_block *b2)
{
   /* Must be a common ancestor at all. */
   if (!nir_block_dominates(result, b1) || !nir_block_dominates(result, b2))
      return false;

   /* If any child also dominates both, result is an ancestor but not the
    * lowest one.  Use an unsigned index: num_dom_children is unsigned, and
    * the original int index triggered a signed/unsigned comparison.
    */
   for (unsigned i = 0; i < result->num_dom_children; i++) {
      nir_block *child = result->dom_children[i];
      if (nir_block_dominates(child, b1) &&
          nir_block_dominates(child, b2))
         return false;
   }

   return true;
}
/* Computes the lowest common ancestor of two blocks in the dominator tree.
 *
 * A NULL or unreachable operand does not constrain the answer: the other
 * block is returned instead (or NULL if it is unusable too).  Requires
 * nir_metadata_dominance_lca on the blocks' impl.
 */
nir_block *
nir_dominance_lca(nir_block *b1, nir_block *b2)
{
   if (b1 == NULL || !nir_block_is_reachable(b1))
      return block_return_if_reachable(b2);
   if (b2 == NULL || !nir_block_is_reachable(b2))
      return block_return_if_reachable(b1);

   assert(nir_cf_node_get_function(&b1->cf_node) ==
          nir_cf_node_get_function(&b2->cf_node));
   nir_function_impl *impl = nir_cf_node_get_function(&b1->cf_node);
   assert(impl->valid_metadata & nir_metadata_dominance_lca);

   /* The LCA is the block with the minimum block index in the Euler-tour
    * span between the two blocks' representative occurrences.
    */
   uint32_t lo = dom_lca_representative(b1);
   uint32_t hi = dom_lca_representative(b2);
   if (hi < lo) {
      uint32_t tmp = lo;
      lo = hi;
      hi = tmp;
   }

   /* range_minimum_query takes a half-open range, hence hi + 1. */
   uint32_t idx = range_minimum_query(&impl->dom_lca_info.table, lo, hi + 1);
   nir_block *result = impl->dom_lca_info.block_from_idx[idx];
   assert(is_lca(result, b1, b2));
   return result;
}

View file

@ -38,6 +38,8 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required, ...)
nir_index_instrs(impl); nir_index_instrs(impl);
if (NEEDS_UPDATE(nir_metadata_dominance)) if (NEEDS_UPDATE(nir_metadata_dominance))
nir_calc_dominance_impl(impl); nir_calc_dominance_impl(impl);
if (NEEDS_UPDATE(nir_metadata_dominance_lca))
nir_calc_dominance_lca_impl(impl);
if (NEEDS_UPDATE(nir_metadata_live_defs)) if (NEEDS_UPDATE(nir_metadata_live_defs))
nir_live_defs_impl(impl); nir_live_defs_impl(impl);
if (NEEDS_UPDATE(nir_metadata_divergence)) if (NEEDS_UPDATE(nir_metadata_divergence))
@ -73,6 +75,9 @@ nir_progress(bool progress, nir_function_impl *impl, nir_metadata preserved)
if (!progress) if (!progress)
preserved = nir_metadata_all; preserved = nir_metadata_all;
if (!(preserved & nir_metadata_dominance))
assert(!(preserved & nir_metadata_dominance_lca));
/* If we discard valid liveness information, immediately free the /* If we discard valid liveness information, immediately free the
* liveness information for each block. For large shaders, it can * liveness information for each block. For large shaders, it can
* consume a huge amount of memory, and it's usually not immediately * consume a huge amount of memory, and it's usually not immediately

View file

@ -796,8 +796,9 @@ weak_gvn(const nir_instr *a, const nir_instr *b)
static bool static bool
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number) opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
{ {
nir_metadata_require(impl, nir_metadata_require(impl, nir_metadata_block_index |
nir_metadata_block_index | nir_metadata_dominance); nir_metadata_dominance |
nir_metadata_dominance_lca);
nir_metadata_require(impl, nir_metadata_loop_analysis, nir_metadata_require(impl, nir_metadata_loop_analysis,
shader->options->force_indirect_unrolling, shader->options->force_indirect_unrolling,
shader->options->force_indirect_unrolling_sampler); shader->options->force_indirect_unrolling_sampler);

View file

@ -361,11 +361,12 @@ nir_opt_sink(nir_shader *shader, nir_move_options options)
bool progress = false; bool progress = false;
nir_foreach_function_impl(impl, shader) { nir_foreach_function_impl(impl, shader) {
nir_metadata_require(impl, nir_metadata required = nir_metadata_block_index |
nir_metadata_block_index | nir_metadata_dominance | nir_metadata_dominance |
(options & (nir_move_only_convergent | nir_metadata_dominance_lca;
nir_move_only_divergent) ? if (options & (nir_move_only_convergent | nir_move_only_divergent))
nir_metadata_divergence : 0)); required |= nir_metadata_divergence;
nir_metadata_require(impl, required);
nir_foreach_block_reverse(block, impl) { nir_foreach_block_reverse(block, impl) {
nir_foreach_instr_reverse_safe(instr, block) { nir_foreach_instr_reverse_safe(instr, block) {

View file

@ -64,7 +64,8 @@ brw_nir_lower_rt_intrinsics_pre_trace(nir_shader *nir)
if (intrinsics->entries > 0) { if (intrinsics->entries > 0) {
nir_foreach_function_with_impl(func, impl, nir) { nir_foreach_function_with_impl(func, impl, nir) {
nir_metadata_require(impl, nir_metadata_dominance); nir_metadata_require(impl, nir_metadata_dominance |
nir_metadata_dominance_lca);
/* Going in reverse order of blocks, move the intrinsics gather above /* Going in reverse order of blocks, move the intrinsics gather above
* in the LCA block to trace calls. * in the LCA block to trace calls.