nir/from_ssa: Stop using nir_parallel_copy_instr

nir_parallel_copy_instr can be emulated with one intrinsic per copy
entry, plus a per-block array of copy lists that the pass uses to
remember which copies belong together.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36483>
Konstantin Seurer 2025-07-30 22:32:23 +02:00 committed by Marge Bot
parent b20fd0ef48
commit 3f3faa82b8
2 changed files with 101 additions and 177 deletions
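In outline, the replacement scheme works as sketched below. This is an
illustrative reconstruction rather than code from the patch: one
parallel_copy intrinsic is emitted per copy entry, and a per-block pair
of util_dynarrays stands in for the grouping that
nir_parallel_copy_instr used to provide. The helper name
emit_grouped_copy is invented for the sketch.

/* Minimal sketch of the emulation described in the commit message.
 * One intrinsic is emitted per copy entry; the dynarrays remember
 * which copies form one parallel copy. (Illustrative only;
 * emit_grouped_copy is not a real helper in the patch.)
 */
struct block_parallel_copies {
   struct util_dynarray start; /* copies right after the block's phis */
   struct util_dynarray end;   /* copies right before the final jump */
};

static nir_intrinsic_instr *
emit_grouped_copy(nir_builder *b, nir_def *value,
                  struct util_dynarray *group)
{
   nir_intrinsic_instr *copy =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_parallel_copy);
   nir_def_init(&copy->instr, &copy->def,
                value->num_components, value->bit_size);
   /* src[1] (reg_dst) starts out as a duplicate of src[0] and is only
    * rewritten to a register handle once registers are resolved.
    */
   copy->src[0] = nir_src_for_ssa(value);
   copy->src[1] = nir_src_for_ssa(value);
   nir_builder_instr_insert(b, &copy->instr);
   util_dynarray_append(group, nir_intrinsic_instr *, copy);
   return copy;
}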

src/compiler/nir/nir_from_ssa.c

@@ -26,6 +26,8 @@
#include "nir_builder_opcodes.h"
#include "nir_vla.h"
#include "util/u_dynarray.h"
/*
* This file implements an out-of-SSA pass as described in "Revisiting
* Out-of-SSA Translation for Correctness, Code Quality, and Efficiency" by
@@ -38,11 +40,17 @@ struct from_ssa_state {
struct exec_list dead_instrs;
bool phi_webs_only;
struct hash_table *merge_node_table;
struct block_parallel_copies *parallel_copies;
nir_instr *instr;
bool consider_divergence;
bool progress;
};
struct block_parallel_copies {
struct util_dynarray start;
struct util_dynarray end;
};
/* Returns whether def @a comes after def @b.
*
* The core observation that makes the Boissinot algorithm efficient
@@ -295,55 +303,6 @@ merge_sets_interfere(merge_set *a, merge_set *b)
return false;
}
static bool
add_parallel_copy_to_end_of_block(nir_shader *shader, nir_block *block, void *dead_ctx)
{
bool need_end_copy = false;
if (block->successors[0]) {
nir_instr *instr = nir_block_first_instr(block->successors[0]);
if (instr && instr->type == nir_instr_type_phi)
need_end_copy = true;
}
if (block->successors[1]) {
nir_instr *instr = nir_block_first_instr(block->successors[1]);
if (instr && instr->type == nir_instr_type_phi)
need_end_copy = true;
}
if (need_end_copy) {
/* If one of our successors has at least one phi node, we need to
* create a parallel copy at the end of the block but before the jump
* (if there is one).
*/
nir_parallel_copy_instr *pcopy =
nir_parallel_copy_instr_create(shader);
nir_instr_insert(nir_after_block_before_jump(block), &pcopy->instr);
}
return true;
}
static nir_parallel_copy_instr *
get_parallel_copy_at_end_of_block(nir_block *block)
{
nir_instr *last_instr = nir_block_last_instr(block);
if (last_instr == NULL)
return NULL;
/* The last instruction may be a jump in which case the parallel copy is
* right before it.
*/
if (last_instr->type == nir_instr_type_jump)
last_instr = nir_instr_prev(last_instr);
if (last_instr && last_instr->type == nir_instr_type_parallel_copy)
return nir_instr_as_parallel_copy(last_instr);
else
return NULL;
}
/** Isolate phi nodes with parallel copies
*
* In order to solve the dependency problems with the sources and
@@ -380,50 +339,27 @@ isolate_phi_nodes_block(nir_shader *shader, nir_block *block, struct from_ssa_st
if (last_phi == NULL)
return true;
/* If we have phi nodes, we need to create a parallel copy at the
* start of this block but after the phi nodes.
*/
nir_parallel_copy_instr *block_pcopy =
nir_parallel_copy_instr_create(shader);
nir_instr_insert_after(&last_phi->instr, &block_pcopy->instr);
nir_foreach_phi(phi, block) {
nir_foreach_phi_src(src, phi) {
if (nir_src_is_undef(src->src))
continue;
nir_parallel_copy_instr *pcopy =
get_parallel_copy_at_end_of_block(src->pred);
assert(pcopy);
nir_builder pred_builder = nir_builder_at(nir_after_block_before_jump(src->pred));
nir_def *pred_copy = nir_parallel_copy(&pred_builder, phi->def.num_components, phi->def.bit_size, src->src.ssa, src->src.ssa);
pred_copy->divergent = state->consider_divergence && nir_src_is_divergent(&src->src);
nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
nir_parallel_copy_entry);
struct block_parallel_copies *pred_copies = &state->parallel_copies[src->pred->index];
util_dynarray_append(&pred_copies->end, nir_intrinsic_instr *, nir_instr_as_intrinsic(pred_copy->parent_instr));
entry->dest_is_reg = false;
nir_def_init(&pcopy->instr, &entry->dest.def,
phi->def.num_components, phi->def.bit_size);
entry->dest.def.divergent = state->consider_divergence && nir_src_is_divergent(&src->src);
/* We're adding a source to a live instruction so we need to use
* nir_instr_init_src()
*/
entry->src_is_reg = false;
nir_instr_init_src(&pcopy->instr, &entry->src, src->src.ssa);
exec_list_push_tail(&pcopy->entries, &entry->node);
nir_src_rewrite(&src->src, &entry->dest.def);
nir_src_rewrite(&src->src, pred_copy);
}
nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
nir_parallel_copy_entry);
nir_intrinsic_instr *copy = nir_intrinsic_instr_create(shader, nir_intrinsic_parallel_copy);
entry->dest_is_reg = false;
nir_def_init(&block_pcopy->instr, &entry->dest.def,
phi->def.num_components, phi->def.bit_size);
entry->dest.def.divergent = state->consider_divergence && phi->def.divergent;
nir_def_init(&copy->instr, &copy->def, phi->def.num_components, phi->def.bit_size);
copy->def.divergent = state->consider_divergence && phi->def.divergent;
nir_def_rewrite_uses(&phi->def, &entry->dest.def);
nir_def_rewrite_uses(&phi->def, &copy->def);
/* We're adding a source to a live instruction so we need to use
* nir_instr_init_src().
@@ -432,10 +368,14 @@ isolate_phi_nodes_block(nir_shader *shader, nir_block *block, struct from_ssa_st
* entry->def, ensuring that entry->src will be the only remaining use
* of the phi.
*/
entry->src_is_reg = false;
nir_instr_init_src(&block_pcopy->instr, &entry->src, &phi->def);
nir_instr_init_src(&copy->instr, &copy->src[0], &phi->def);
nir_instr_init_src(&copy->instr, &copy->src[1], &phi->def);
exec_list_push_tail(&block_pcopy->entries, &entry->node);
struct block_parallel_copies *copies = &state->parallel_copies[block->index];
util_dynarray_append(&copies->start, nir_intrinsic_instr *, copy);
nir_builder builder = nir_builder_at(nir_after_instr(&last_phi->instr));
nir_builder_instr_insert(&builder, &copy->instr);
}
return true;
@@ -461,23 +401,24 @@ coalesce_phi_nodes_block(nir_block *block, struct from_ssa_state *state)
}
static void
aggressive_coalesce_parallel_copy(nir_parallel_copy_instr *pcopy,
aggressive_coalesce_parallel_copy(struct util_dynarray *pcopy,
struct from_ssa_state *state)
{
nir_foreach_parallel_copy_entry(entry, pcopy) {
assert(!entry->src_is_reg);
assert(!entry->dest_is_reg);
assert(entry->dest.def.num_components ==
entry->src.ssa->num_components);
util_dynarray_foreach(pcopy, nir_intrinsic_instr *, copy_pointer) {
nir_intrinsic_instr *copy = *copy_pointer;
assert(!nir_intrinsic_src_is_reg(copy));
assert(!nir_intrinsic_dst_is_reg(copy));
assert(copy->def.num_components == copy->src[0].ssa->num_components);
/* Since load_const instructions are SSA only, we can't replace their
* destinations with registers and, therefore, can't coalesce them.
*/
if (entry->src.ssa->parent_instr->type == nir_instr_type_load_const)
if (copy->src[0].ssa->parent_instr->type == nir_instr_type_load_const)
continue;
merge_node *src_node = get_merge_node(entry->src.ssa, state);
merge_node *dest_node = get_merge_node(&entry->dest.def, state);
merge_node *src_node = get_merge_node(copy->src[0].ssa, state);
merge_node *dest_node = get_merge_node(&copy->def, state);
if (src_node->set == dest_node->set)
continue;
@@ -493,31 +434,12 @@ aggressive_coalesce_parallel_copy(nir_parallel_copy_instr *pcopy,
}
}
static bool
static void
aggressive_coalesce_block(nir_block *block, struct from_ssa_state *state)
{
nir_parallel_copy_instr *start_pcopy = NULL;
nir_foreach_instr(instr, block) {
/* Phi nodes only ever come at the start of a block */
if (instr->type != nir_instr_type_phi) {
if (instr->type != nir_instr_type_parallel_copy)
break; /* The parallel copy must be right after the phis */
start_pcopy = nir_instr_as_parallel_copy(instr);
aggressive_coalesce_parallel_copy(start_pcopy, state);
break;
}
}
nir_parallel_copy_instr *end_pcopy =
get_parallel_copy_at_end_of_block(block);
if (end_pcopy && end_pcopy != start_pcopy)
aggressive_coalesce_parallel_copy(end_pcopy, state);
return true;
struct block_parallel_copies *copies = &state->parallel_copies[block->index];
aggressive_coalesce_parallel_copy(&copies->start, state);
aggressive_coalesce_parallel_copy(&copies->end, state);
}
static nir_def *
@@ -703,47 +625,42 @@ resolve_registers_impl(nir_function_impl *impl, struct from_ssa_state *state)
}
nir_foreach_instr_reverse_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_phi:
remove_no_op_phi(instr, state);
break;
case nir_instr_type_parallel_copy: {
nir_parallel_copy_instr *pcopy = nir_instr_as_parallel_copy(instr);
nir_foreach_parallel_copy_entry(entry, pcopy) {
assert(!entry->dest_is_reg);
if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
if (intrinsic->intrinsic == nir_intrinsic_parallel_copy) {
assert(!nir_intrinsic_src_is_reg(intrinsic));
assert(!nir_intrinsic_dst_is_reg(intrinsic));
/* Parallel copy destinations will always be registers */
nir_def *reg = reg_for_ssa_def(&entry->dest.def, state);
nir_def *reg = reg_for_ssa_def(&intrinsic->def, state);
assert(reg != NULL);
/* We're switching from the nir_def to the nir_src in the dest
* union so we need to use nir_instr_init_src() here.
*/
assert(nir_def_is_unused(&entry->dest.def));
entry->dest_is_reg = true;
nir_instr_init_src(&pcopy->instr, &entry->dest.reg, reg);
}
assert(nir_def_is_unused(&intrinsic->def));
nir_intrinsic_set_dst_is_reg(intrinsic, true);
nir_src_rewrite(&intrinsic->src[1], reg);
nir_foreach_parallel_copy_entry(entry, pcopy) {
assert(!entry->src_is_reg);
nir_def *reg = reg_for_ssa_def(entry->src.ssa, state);
if (reg == NULL)
continue;
reg = reg_for_ssa_def(intrinsic->src[0].ssa, state);
if (reg) {
nir_intrinsic_set_src_is_reg(intrinsic, true);
nir_src_rewrite(&intrinsic->src[0], reg);
}
entry->src_is_reg = true;
nir_src_rewrite(&entry->src, reg);
continue;
}
break;
}
default:
state->builder.cursor = nir_after_instr(instr);
nir_foreach_def(instr, rewrite_ssa_def, state);
state->builder.cursor = nir_before_instr(instr);
nir_foreach_src(instr, rewrite_src, state);
if (instr->type == nir_instr_type_phi) {
remove_no_op_phi(instr, state);
continue;
}
state->builder.cursor = nir_after_instr(instr);
nir_foreach_def(instr, rewrite_ssa_def, state);
state->builder.cursor = nir_before_instr(instr);
nir_foreach_src(instr, rewrite_src, state);
}
}
}
@@ -803,14 +720,19 @@ copy_values(struct from_ssa_state *state, struct copy_value dest, struct copy_va
}
static void
resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
resolve_parallel_copy(struct util_dynarray *pcopy,
struct from_ssa_state *state)
{
unsigned num_copies = 0;
nir_foreach_parallel_copy_entry(entry, pcopy) {
nir_intrinsic_instr *first_copy = NULL;
util_dynarray_foreach(pcopy, nir_intrinsic_instr *, copy_pointer) {
nir_intrinsic_instr *copy = *copy_pointer;
if (!first_copy)
first_copy = copy;
/* Sources may be SSA but destinations are always registers */
assert(entry->dest_is_reg);
if (entry->src_is_reg && entry->src.ssa == entry->dest.reg.ssa)
assert(nir_intrinsic_dst_is_reg(copy));
if (nir_intrinsic_src_is_reg(copy) && copy->src[0].ssa == copy->src[1].ssa)
continue;
num_copies++;
@@ -818,8 +740,6 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
if (num_copies == 0) {
/* Hooray, we don't need any copies! */
nir_instr_remove(&pcopy->instr);
exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node);
return;
}
@@ -836,7 +756,7 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
NIR_VLA(int, to_do, num_copies * 2);
int to_do_idx = -1;
state->builder.cursor = nir_before_instr(&pcopy->instr);
state->builder.cursor = nir_before_instr(&first_copy->instr);
/* Now we set everything up:
* - All values get assigned a temporary index
@@ -844,14 +764,15 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
* - Predecessors are recorded from sources and destinations
*/
int num_vals = 0;
nir_foreach_parallel_copy_entry(entry, pcopy) {
util_dynarray_foreach(pcopy, nir_intrinsic_instr *, copy_pointer) {
nir_intrinsic_instr *copy = *copy_pointer;
/* Sources may be SSA but destinations are always registers */
if (entry->src_is_reg && entry->src.ssa == entry->dest.reg.ssa)
if (nir_intrinsic_src_is_reg(copy) && copy->src[0].ssa == copy->src[1].ssa)
continue;
struct copy_value src_value = {
.is_reg = entry->src_is_reg,
.ssa = entry->src.ssa,
.is_reg = nir_intrinsic_src_is_reg(copy),
.ssa = copy->src[0].ssa,
};
int src_idx = -1;
@@ -864,10 +785,10 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
values[src_idx] = src_value;
}
assert(entry->dest_is_reg);
assert(nir_intrinsic_dst_is_reg(copy));
struct copy_value dest_value = {
.is_reg = true,
.ssa = entry->dest.reg.ssa,
.ssa = copy->src[1].ssa,
};
int dest_idx = -1;
@@ -982,36 +903,29 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
ready[++ready_idx] = b;
num_vals++;
}
nir_instr_remove(&pcopy->instr);
exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node);
}
/* Resolves the parallel copies in a block. Each block can have at most
* two: One at the beginning, right after all the phi nodes, and one at
* the end (or right before the final jump if it exists).
*/
static bool
static void
resolve_parallel_copies_block(nir_block *block, struct from_ssa_state *state)
{
/* At this point, we have removed all of the phi nodes. If a parallel
* copy existed right after the phi nodes in this block, it is now the
* first instruction.
*/
nir_instr *first_instr = nir_block_first_instr(block);
if (first_instr == NULL)
return true; /* Empty, nothing to do. */
struct block_parallel_copies *copies = &state->parallel_copies[block->index];
resolve_parallel_copy(&copies->start, state);
resolve_parallel_copy(&copies->end, state);
/* There can be load_reg in the way of the copies... don't be clever. */
nir_foreach_instr_safe(instr, block) {
if (instr->type == nir_instr_type_parallel_copy) {
nir_parallel_copy_instr *pcopy = nir_instr_as_parallel_copy(instr);
if (instr->type != nir_instr_type_intrinsic)
continue;
resolve_parallel_copy(pcopy, state);
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
if (intrinsic->intrinsic == nir_intrinsic_parallel_copy) {
nir_instr_remove(instr);
exec_list_push_tail(&state->dead_instrs, &instr->node);
}
}
return true;
}
static bool
@@ -1022,16 +936,20 @@ nir_convert_from_ssa_impl(nir_function_impl *impl,
struct from_ssa_state state;
nir_metadata_require(impl, nir_metadata_block_index);
state.builder = nir_builder_create(impl);
state.dead_ctx = ralloc_context(NULL);
state.phi_webs_only = phi_webs_only;
state.merge_node_table = _mesa_pointer_hash_table_create(NULL);
state.parallel_copies = ralloc_array(state.dead_ctx, struct block_parallel_copies, impl->num_blocks);
state.consider_divergence = consider_divergence;
state.progress = false;
exec_list_make_empty(&state.dead_instrs);
nir_foreach_block(block, impl) {
add_parallel_copy_to_end_of_block(shader, block, state.dead_ctx);
for (uint32_t i = 0; i < impl->num_blocks; i++) {
util_dynarray_init(&state.parallel_copies[i].start, state.parallel_copies);
util_dynarray_init(&state.parallel_copies[i].end, state.parallel_copies);
}
nir_foreach_block(block, impl) {

src/compiler/nir/nir_intrinsics.py

@@ -358,6 +358,9 @@ index("struct glsl_cmat_description", "src_cmat_desc")
# coordinates in compute.
index("bool", "explicit_coord")
index("bool", "src_is_reg")
index("bool", "dst_is_reg")
# The index of the format string used by a printf. (u_printf_info element of the shader)
index("unsigned", "fmt_idx")
# for NV coop matrix - num of matrix in load 1/2/4
@@ -389,6 +392,9 @@ intrinsic("store_deref", src_comp=[-1, 0], indices=[WRITE_MASK, ACCESS])
intrinsic("copy_deref", src_comp=[-1, -1], indices=[DST_ACCESS, SRC_ACCESS])
intrinsic("memcpy_deref", src_comp=[-1, -1, 1], indices=[DST_ACCESS, SRC_ACCESS])
# Sources: [src, reg_dst], Definition: dst
intrinsic("parallel_copy", dest_comp=0, src_comp=[-1, -1], indices=[SRC_IS_REG, DST_IS_REG])
# Returns an opaque handle representing a register indexed by BASE. The
# logically def-use list of a register is given by the use list of this handle.
# The shape of the underlying register is given by the indices, the handle
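For reference, the no-op test that resolve_parallel_copy applies to
each of these intrinsics can be written as a small predicate. This is a
sketch mirroring the checks visible in the diff above; the helper name
parallel_copy_is_noop is invented:

/* A copy whose register source and register destination are the same
 * handle moves nothing and can be skipped. By resolve time, every
 * destination has already been turned into a register handle.
 */
static bool
parallel_copy_is_noop(nir_intrinsic_instr *copy)
{
   assert(copy->intrinsic == nir_intrinsic_parallel_copy);
   assert(nir_intrinsic_dst_is_reg(copy));
   return nir_intrinsic_src_is_reg(copy) &&
          copy->src[0].ssa == copy->src[1].ssa;
}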