mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 07:20:10 +01:00
nir: Add a faster lowest common ancestor algorithm
On a fossil from the blender 4.5.0 vulkan backend, this improves compile times in nak by about 17%. Compile time of other shaders improves by a more modest 1.2%. No stat changes on shader-db. Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36184>
This commit is contained in:
parent
0d07b86073
commit
17876a00af
9 changed files with 226 additions and 38 deletions
|
|
@ -114,6 +114,7 @@ else
|
|||
'nir_deref.h',
|
||||
'nir_divergence_analysis.c',
|
||||
'nir_dominance.c',
|
||||
'nir_dominance_lca.c',
|
||||
'nir_fixup_is_exported.c',
|
||||
'nir_format_convert.c',
|
||||
'nir_format_convert.h',
|
||||
|
|
|
|||
|
|
@ -692,6 +692,8 @@ nir_function_impl_create_bare(nir_shader *shader)
|
|||
impl->num_blocks = 0;
|
||||
impl->valid_metadata = nir_metadata_none;
|
||||
impl->structured = true;
|
||||
range_minimum_query_table_init(&impl->dom_lca_info.table);
|
||||
impl->dom_lca_info.block_from_idx = NULL;
|
||||
|
||||
/* create start & end blocks */
|
||||
nir_block *start_block = nir_block_create(shader);
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@
|
|||
#include "util/log.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/ralloc.h"
|
||||
#include "util/range_minimum_query.h"
|
||||
#include "util/set.h"
|
||||
#include "util/u_math.h"
|
||||
#include "nir_defines.h"
|
||||
|
|
@ -3476,6 +3477,17 @@ typedef enum {
|
|||
*/
|
||||
nir_metadata_divergence = 0x40,
|
||||
|
||||
/** Indicates that block dominance LCA information is valid
|
||||
*
|
||||
* This includes:
|
||||
*
|
||||
* - nir_function_impl::dom_lca_info
|
||||
*
|
||||
* A pass can preserve this metadata type if it preserves
|
||||
* nir_metadata_dominance.
|
||||
*/
|
||||
nir_metadata_dominance_lca = 0x80,
|
||||
|
||||
/** All control flow metadata
|
||||
*
|
||||
* This includes all metadata preserved by a pass that preserves control flow
|
||||
|
|
@ -3486,7 +3498,8 @@ typedef enum {
|
|||
* This is the most common metadata set to preserve, so it has its own alias.
|
||||
*/
|
||||
nir_metadata_control_flow = nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
nir_metadata_dominance |
|
||||
nir_metadata_dominance_lca,
|
||||
|
||||
/** All metadata
|
||||
*
|
||||
|
|
@ -3524,6 +3537,12 @@ typedef struct nir_function_impl {
|
|||
/* total number of basic blocks, only valid when block_index_dirty = false */
|
||||
unsigned num_blocks;
|
||||
|
||||
/** Information used for LCA queries */
|
||||
struct nir_dom_lca_info {
|
||||
struct range_minimum_query_table table;
|
||||
nir_block **block_from_idx;
|
||||
} dom_lca_info;
|
||||
|
||||
/** True if this nir_function_impl uses structured control-flow
|
||||
*
|
||||
* Structured nir_function_impls have different validation rules.
|
||||
|
|
@ -4921,8 +4940,18 @@ bool nir_shader_lower_instructions(nir_shader *shader,
|
|||
|
||||
void nir_calc_dominance_impl(nir_function_impl *impl);
|
||||
void nir_calc_dominance(nir_shader *shader);
|
||||
void nir_calc_dominance_lca_impl(nir_function_impl *impl);
|
||||
|
||||
/**
|
||||
* Computes the lowest common ancestor of two blocks in the dominator tree.
|
||||
*
|
||||
* If one of the blocks is null or unreachable, the other block is returned or
|
||||
* NULL if it's unreachable.
|
||||
*
|
||||
* Requires nir_metadata_dominance_lca
|
||||
*/
|
||||
nir_block *nir_dominance_lca(nir_block *b1, nir_block *b2);
|
||||
|
||||
bool nir_block_dominates(nir_block *parent, nir_block *child);
|
||||
bool nir_block_is_unreachable(nir_block *block);
|
||||
|
||||
|
|
|
|||
|
|
@ -212,35 +212,6 @@ nir_calc_dominance(nir_shader *shader)
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Pass a block through only if it is usable for dominance queries.
 *
 * Returns b when it is non-NULL and nir_block_is_reachable() reports it as
 * reachable; returns NULL otherwise.
 */
static nir_block *
block_return_if_reachable(nir_block *b)
{
   if (b == NULL)
      return NULL;

   return nir_block_is_reachable(b) ? b : NULL;
}
|
||||
|
||||
/**
|
||||
* Computes the least common ancestor of two blocks. If one of the blocks
|
||||
* is null or unreachable, the other block is returned or NULL if it's
|
||||
* unreachable.
|
||||
*/
|
||||
nir_block *
|
||||
nir_dominance_lca(nir_block *b1, nir_block *b2)
|
||||
{
|
||||
if (b1 == NULL || !nir_block_is_reachable(b1))
|
||||
return block_return_if_reachable(b2);
|
||||
|
||||
if (b2 == NULL || !nir_block_is_reachable(b2))
|
||||
return block_return_if_reachable(b1);
|
||||
|
||||
assert(nir_cf_node_get_function(&b1->cf_node) ==
|
||||
nir_cf_node_get_function(&b2->cf_node));
|
||||
|
||||
assert(nir_cf_node_get_function(&b1->cf_node)->valid_metadata &
|
||||
nir_metadata_dominance);
|
||||
|
||||
return intersect(b1, b2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if parent dominates child according to the following
|
||||
* definition:
|
||||
|
|
|
|||
177
src/compiler/nir/nir_dominance_lca.c
Normal file
177
src/compiler/nir/nir_dominance_lca.c
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
/*
|
||||
* Copyright 2025 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "nir.h"
|
||||
|
||||
/**
|
||||
* Find the lowest common ancestor in the dominance tree.
|
||||
*
|
||||
* We reduce the LCA problem to range minimum query using the standard Euler
|
||||
* tour method (see e.g. Bender and Colton section 2). From there, we use the
|
||||
* simple RMQ algorithm that uses O(n log n) preprocessing time and O(1) query
|
||||
* time (Bender and Colton section 3).
|
||||
*
|
||||
* As a slight modification, we store the block index instead of the block
|
||||
* depth. We can do this because the lower tree depth is always at a lower block
|
||||
* index and we use an RMQ algorithm that doesn't rely on the -1/+1 property.
|
||||
*
|
||||
* Bender, M.A., Farach-Colton, M. (2000). The LCA Problem Revisited. In:
|
||||
* Gonnet, G.H., Viola, A. (eds) LATIN 2000: Theoretical Informatics. LATIN
|
||||
* 2000. Lecture Notes in Computer Science, vol 1776. Springer, Berlin,
|
||||
* Heidelberg. https://doi.org/10.1007/10719839_9
|
||||
*/
|
||||
|
||||
static void
|
||||
realloc_info(nir_function_impl *impl)
|
||||
{
|
||||
struct nir_dom_lca_info *info = &impl->dom_lca_info;
|
||||
const uint32_t euler_tour_size = impl->num_blocks * 2 - 1;
|
||||
|
||||
void *mem_ctx = ralloc_parent(impl);
|
||||
range_minimum_query_table_resize(&info->table, mem_ctx, euler_tour_size);
|
||||
info->block_from_idx = reralloc_array_size(mem_ctx, info->block_from_idx,
|
||||
sizeof(nir_block *),
|
||||
impl->num_blocks);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
dom_lca_representative(nir_block *block)
|
||||
{
|
||||
/* The dom_pre_index is 1-indexed so we need to subtract one to match our
|
||||
* indices
|
||||
*/
|
||||
return block->dom_pre_index - 1;
|
||||
}
|
||||
|
||||
static void
|
||||
generate_euler_tour(nir_function_impl *impl)
|
||||
{
|
||||
uint32_t *table = impl->dom_lca_info.table.table;
|
||||
nir_block **block_from_idx = impl->dom_lca_info.block_from_idx;
|
||||
if (impl->num_blocks == 1) {
|
||||
nir_block *block = nir_start_block(impl);
|
||||
table[0] = 0;
|
||||
block_from_idx[0] = block;
|
||||
return;
|
||||
}
|
||||
|
||||
/* By definition, the first row of the table contains range minimum query
|
||||
* lookups for each single-element block, meaning it is just the array that
|
||||
* we will perform RMQs on. Therefore, when generating the Euler tour, we
|
||||
* store results in the first row and are free to use the rest of the table
|
||||
* as scratch memory for the depth-first search.
|
||||
*
|
||||
* The stack contains the index of the node's next child to visit.
|
||||
*/
|
||||
assert(impl->dom_lca_info.table.height >= 2);
|
||||
STATIC_ASSERT(sizeof(uint32_t) <= sizeof(nir_block *));
|
||||
uint32_t *dfs_stack = (uint32_t *)&table[impl->dom_lca_info.table.width];
|
||||
|
||||
nir_block *cur_block = nir_start_block(impl);
|
||||
uint32_t *cur_stack = dfs_stack;
|
||||
|
||||
bool first_visit = true;
|
||||
uint32_t i;
|
||||
for (i = 0; i < impl->dom_lca_info.table.width; i++) {
|
||||
if (cur_block == NULL) {
|
||||
/* This can happen earlier than expected if some blocks are
|
||||
* unreachable
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
assert(cur_stack >= dfs_stack);
|
||||
table[i] = cur_block->index;
|
||||
|
||||
if (first_visit) {
|
||||
/* First visit. Place it on the stack. */
|
||||
*cur_stack = 0;
|
||||
assert(i == dom_lca_representative(cur_block));
|
||||
block_from_idx[cur_block->index] = cur_block;
|
||||
}
|
||||
|
||||
if (*cur_stack < cur_block->num_dom_children) {
|
||||
cur_block = cur_block->dom_children[*cur_stack];
|
||||
*cur_stack += 1;
|
||||
cur_stack += 1;
|
||||
first_visit = true;
|
||||
} else {
|
||||
assert(*cur_stack == cur_block->num_dom_children);
|
||||
cur_block = cur_block->imm_dom;
|
||||
cur_stack -= 1;
|
||||
first_visit = false;
|
||||
}
|
||||
}
|
||||
|
||||
assert(cur_block == NULL);
|
||||
|
||||
if (i != impl->dom_lca_info.table.width) {
|
||||
void *mem_ctx = ralloc_parent(impl);
|
||||
range_minimum_query_table_resize(&impl->dom_lca_info.table, mem_ctx, i);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
nir_calc_dominance_lca_impl(nir_function_impl *impl)
|
||||
{
|
||||
if (impl->valid_metadata & nir_metadata_dominance_lca)
|
||||
return;
|
||||
|
||||
nir_metadata_require(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
|
||||
realloc_info(impl);
|
||||
generate_euler_tour(impl);
|
||||
range_minimum_query_table_preprocess(&impl->dom_lca_info.table);
|
||||
}
|
||||
|
||||
/**
 * Returns b if it is non-NULL and reachable according to
 * nir_block_is_reachable(), and NULL in every other case.
 */
static nir_block *
block_return_if_reachable(nir_block *b)
{
   if (b == NULL)
      return NULL;

   if (!nir_block_is_reachable(b))
      return NULL;

   return b;
}
|
||||
|
||||
static bool
|
||||
is_lca(nir_block *result, nir_block *b1, nir_block *b2)
|
||||
{
|
||||
if (!nir_block_dominates(result, b1) || !nir_block_dominates(result, b2))
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < result->num_dom_children; i++) {
|
||||
nir_block *child = result->dom_children[i];
|
||||
if (nir_block_dominates(child, b1) &&
|
||||
nir_block_dominates(child, b2))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Computes the lowest common ancestor of two blocks in the dominator tree.
 *
 * A block that is NULL or unreachable is ignored: the other block is
 * returned, or NULL if that one is NULL or unreachable too. Otherwise both
 * blocks must belong to the same function implementation, and that
 * implementation must have valid nir_metadata_dominance_lca.
 */
nir_block *
nir_dominance_lca(nir_block *b1, nir_block *b2)
{
   nir_block *usable1 = block_return_if_reachable(b1);
   nir_block *usable2 = block_return_if_reachable(b2);

   /* With at most one usable block there is nothing to intersect. */
   if (usable1 == NULL)
      return usable2;
   if (usable2 == NULL)
      return usable1;

   assert(nir_cf_node_get_function(&b1->cf_node) ==
          nir_cf_node_get_function(&b2->cf_node));

   nir_function_impl *impl = nir_cf_node_get_function(&b1->cf_node);
   assert(impl->valid_metadata & nir_metadata_dominance_lca);

   /* Order the two representatives so the query range runs low to high. */
   const uint32_t rep1 = dom_lca_representative(b1);
   const uint32_t rep2 = dom_lca_representative(b2);
   const uint32_t first = rep1 > rep2 ? rep2 : rep1;
   const uint32_t last = rep1 > rep2 ? rep1 : rep2;

   /* The table stores block indices, so the minimum over the tour segment
    * between the two blocks is the index of the result block. The upper
    * bound is passed as last + 1 (presumably exclusive - confirm against
    * range_minimum_query()).
    */
   const uint32_t block_index =
      range_minimum_query(&impl->dom_lca_info.table, first, last + 1);
   nir_block *lca = impl->dom_lca_info.block_from_idx[block_index];

   assert(is_lca(lca, b1, b2));

   return lca;
}
|
||||
|
|
@ -38,6 +38,8 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required, ...)
|
|||
nir_index_instrs(impl);
|
||||
if (NEEDS_UPDATE(nir_metadata_dominance))
|
||||
nir_calc_dominance_impl(impl);
|
||||
if (NEEDS_UPDATE(nir_metadata_dominance_lca))
|
||||
nir_calc_dominance_lca_impl(impl);
|
||||
if (NEEDS_UPDATE(nir_metadata_live_defs))
|
||||
nir_live_defs_impl(impl);
|
||||
if (NEEDS_UPDATE(nir_metadata_divergence))
|
||||
|
|
@ -73,6 +75,9 @@ nir_progress(bool progress, nir_function_impl *impl, nir_metadata preserved)
|
|||
if (!progress)
|
||||
preserved = nir_metadata_all;
|
||||
|
||||
if (!(preserved & nir_metadata_dominance))
|
||||
assert(!(preserved & nir_metadata_dominance_lca));
|
||||
|
||||
/* If we discard valid liveness information, immediately free the
|
||||
* liveness information for each block. For large shaders, it can
|
||||
* consume a huge amount of memory, and it's usually not immediately
|
||||
|
|
|
|||
|
|
@ -796,8 +796,9 @@ weak_gvn(const nir_instr *a, const nir_instr *b)
|
|||
static bool
|
||||
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
|
||||
{
|
||||
nir_metadata_require(impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance);
|
||||
nir_metadata_require(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance |
|
||||
nir_metadata_dominance_lca);
|
||||
nir_metadata_require(impl, nir_metadata_loop_analysis,
|
||||
shader->options->force_indirect_unrolling,
|
||||
shader->options->force_indirect_unrolling_sampler);
|
||||
|
|
|
|||
|
|
@ -361,11 +361,12 @@ nir_opt_sink(nir_shader *shader, nir_move_options options)
|
|||
bool progress = false;
|
||||
|
||||
nir_foreach_function_impl(impl, shader) {
|
||||
nir_metadata_require(impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance |
|
||||
(options & (nir_move_only_convergent |
|
||||
nir_move_only_divergent) ?
|
||||
nir_metadata_divergence : 0));
|
||||
nir_metadata required = nir_metadata_block_index |
|
||||
nir_metadata_dominance |
|
||||
nir_metadata_dominance_lca;
|
||||
if (options & (nir_move_only_convergent | nir_move_only_divergent))
|
||||
required |= nir_metadata_divergence;
|
||||
nir_metadata_require(impl, required);
|
||||
|
||||
nir_foreach_block_reverse(block, impl) {
|
||||
nir_foreach_instr_reverse_safe(instr, block) {
|
||||
|
|
|
|||
|
|
@ -64,7 +64,8 @@ brw_nir_lower_rt_intrinsics_pre_trace(nir_shader *nir)
|
|||
|
||||
if (intrinsics->entries > 0) {
|
||||
nir_foreach_function_with_impl(func, impl, nir) {
|
||||
nir_metadata_require(impl, nir_metadata_dominance);
|
||||
nir_metadata_require(impl, nir_metadata_dominance |
|
||||
nir_metadata_dominance_lca);
|
||||
|
||||
/* Going in reverse order of blocks, move the intrinsics gather above
|
||||
* in the LCA block to trace calls.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue