/* Mirrored from https://gitlab.freedesktop.org/mesa/mesa.git
 * Cc: mesa-stable
 * Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
 * Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35247>
 */
/*
 * Copyright © 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */
/* For each output slot, gather which input components are used to compute it.
 * Component-wise ALU instructions must be scalar.
 */
#include "nir_builder.h"
#include "util/hash_table.h"
#include "util/u_dynarray.h"
#include "util/u_memory.h"
static void
|
|
accum_deps(BITSET_WORD *dst, BITSET_WORD *src, unsigned num_bitset_words)
|
|
{
|
|
__bitset_or(dst, dst, src, num_bitset_words);
|
|
}
|
|
|
|
/* Callback state threaded through nir_foreach_src() into accum_src_deps(). */
typedef struct {
   /* Per-instruction dependency bitsets, indexed by nir_instr::index. */
   BITSET_WORD **instr_deps;
   /* Scratch bitset; current callers always pass NULL here. */
   BITSET_WORD *tmp;
   /* Number of BITSET_WORDs in each bitset in instr_deps. */
   unsigned num_bitset_words;
} foreach_src_data;
static bool
|
|
accum_src_deps(nir_src *src, void *opaque)
|
|
{
|
|
foreach_src_data *data = (foreach_src_data *)opaque;
|
|
nir_instr *src_instr = src->ssa->parent_instr;
|
|
|
|
if (src_instr->type == nir_instr_type_load_const ||
|
|
src_instr->type == nir_instr_type_undef)
|
|
return true;
|
|
|
|
nir_instr *dst_instr = nir_src_parent_instr(src);
|
|
accum_deps(data->instr_deps[dst_instr->index],
|
|
data->instr_deps[src_instr->index], data->num_bitset_words);
|
|
return true;
|
|
}
|
|
|
|
/* One entry of the loop stack used while walking (possibly nested) loops. */
typedef struct {
   nir_block *start_block; /* the first block of the loop */
   nir_block *exit_block; /* the first block after the loop */
   /* Set when a loop-header phi gained new dependencies during the current
    * walk of the loop body, which forces another walk of the loop.
    */
   bool header_phi_changed;
} loop_entry;
static loop_entry *
|
|
get_current_loop(struct util_dynarray *loop_stack)
|
|
{
|
|
assert(util_dynarray_num_elements(loop_stack, loop_entry));
|
|
return util_dynarray_last_ptr(loop_stack, loop_entry);
|
|
}
|
|
|
|
/* For each output slot, gather which instructions are used to compute it.
 * The result is that each output slot will have the list of all instructions
 * that must execute to compute that output.
 *
 * If there are memory operations that affect other memory operations, those
 * dependencies are not gathered.
 *
 * Required:
 * - The shader must be in LCSSA form.
 *
 * Recommended:
 * - IO intrinsics and component-wise ALU instructions should be scalar, and
 *   vecN opcodes should have their components copy-propagated. If not,
 *   the results will have false dependencies.
 *
 * Algorithm:
 * - For each instruction, compute a bitset of instruction indices whose
 *   results are needed to compute the result of the instruction. The final
 *   bitset is the instruction index OR'd with bitsets of all its sources and
 *   also all if-conditions used to enter the block, recursively.
 * - Since every instruction inherits instruction bitsets from its sources,
 *   every instruction contains the list of all instructions that must execute
 *   before the instruction can execute.
 * - At the end, output stores contain the list of instructions that must
 *   execute to compute their results. This may be any subset of instructions
 *   from the shader, including all instructions.
 *
 * Control flow notes:
 * - There is a stack of "if" conditions for entered ifs.
 * - The dependencies of instructions are the union of dependencies of all
 *   their sources and all if conditions on the if-condition stack.
 * - For each continue, all loop-header phis receive the dependencies of all
 *   if-conditions on the if-condition stack at the continue.
 * - For each break, all loop-exit phis receive the dependencies of all
 *   if-conditions on the if-condition stack at the break.
 * - If there is any change to loop-header phis while iterating over a loop,
 *   we iterate over the loop again after the current iteration is finished.
 */
/* Compute, for every output slot written by the shader's entrypoint, the set
 * of instructions that must execute to produce that output. The per-slot
 * lists in "deps" are malloc'ed; the caller must release them with
 * nir_free_output_dependencies().
 */
void
nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   /* Instruction indices are used as bit positions in the bitsets below. */
   nir_metadata_require(impl, nir_metadata_instr_index);
   unsigned num_instr = nir_impl_last_block(impl)->end_ip;

   /* Allocate bitsets of instruction->instruction dependencies. */
   unsigned num_bitset_words = BITSET_WORDS(num_instr);
   BITSET_WORD **instr_deps = rzalloc_array(NULL, BITSET_WORD*, num_instr);
   /* The instr_deps array doubles as the ralloc context for all temporary
    * allocations; ralloc_free(mem_ctx) at the end releases everything.
    */
   void *mem_ctx = instr_deps;
   for (unsigned i = 0; i < num_instr; i++)
      instr_deps[i] = rzalloc_array(mem_ctx, BITSET_WORD, num_bitset_words);

   /* Allocate bitsets of instruction->output dependencies.
    * out_deps[slot] is allocated lazily when a store to that slot is seen.
    */
   BITSET_WORD **out_deps = rzalloc_array(mem_ctx, BITSET_WORD*,
                                          NUM_TOTAL_VARYING_SLOTS);

   /* Allocate stacks. */
   struct util_dynarray loop_stack, if_cond_stack;
   util_dynarray_init(&loop_stack, mem_ctx);
   util_dynarray_init(&if_cond_stack, mem_ctx);

   /* Gather dependencies of every instruction.
    * Dependencies of each instruction are OR'd dependencies of its sources and
    * control flow conditions.
    */
   nir_foreach_block(block, impl) {
      nir_cf_node *parent_cf = block->cf_node.parent;
      bool is_loop_first_block = parent_cf->type == nir_cf_node_loop &&
                                 block == nir_cf_node_cf_tree_first(parent_cf);
      /* Entering a loop: push a new entry on the loop stack. */
      if (is_loop_first_block) {
         loop_entry loop = {
            .start_block = block,
            .exit_block = nir_cf_node_cf_tree_next(parent_cf),
         };
         util_dynarray_append(&loop_stack, loop_entry, loop);
      }

      /* Entering the then-branch of an if: push its condition. It stays on
       * the stack until the last else block is processed below.
       */
      if (parent_cf->type == nir_cf_node_if &&
          block == nir_if_first_then_block(nir_cf_node_as_if(parent_cf))) {
         util_dynarray_append(&if_cond_stack, nir_def *,
                              nir_cf_node_as_if(parent_cf)->condition.ssa);
      }

   loop_again:
      nir_foreach_instr(instr, block) {
         /* Add self as a dependency. */
         BITSET_WORD *this_instr_deps = instr_deps[instr->index];
         BITSET_SET(this_instr_deps, instr->index);

         /* Add sources as dependencies. */
         nir_foreach_src(instr, accum_src_deps,
                         &(foreach_src_data){instr_deps, NULL, num_bitset_words});

         /* Add parent if-conditions as dependencies.
          *
          * Note that phis with sources inside conditional blocks don't need
          * this because the phi sources already contain if-conditions.
          */
         util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
            accum_deps(this_instr_deps,
                       instr_deps[(*cond)->parent_instr->index],
                       num_bitset_words);
         }

         /* Gather the current instruction. */
         switch (instr->type) {
         case nir_instr_type_jump:
            switch (nir_instr_as_jump(instr)->type) {
            case nir_jump_continue:
            case nir_jump_break: {
               loop_entry *loop = get_current_loop(&loop_stack);
               /* Iterate over all loop-header phis (for continue) or all
                * loop-exit phis (for break).
                *
                * Assumption: Only the loop-start block can have loop-header
                * phis.
                */
               bool is_continue =
                  nir_instr_as_jump(instr)->type == nir_jump_continue;
               nir_block *iter_block =
                  is_continue ? loop->start_block : loop->exit_block;
               assert(iter_block);

               nir_foreach_phi(phi, iter_block) {
                  /* We need to track whether any header phi of the current
                   * loop has changed because we need to walk such loops
                   * again. Use the bitset bitcount to determine whether
                   * any instruction has been added to header phis as
                   * a dependency.
                   */
                  unsigned old_count = 0;
                  if (is_continue) {
                     old_count = __bitset_count(instr_deps[phi->instr.index],
                                                num_bitset_words);
                  }

                  /* Add dependencies of all if-conditions affecting the
                   * jump statement to phis at the loop header / exit.
                   */
                  util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
                     accum_deps(instr_deps[phi->instr.index],
                                instr_deps[(*cond)->parent_instr->index],
                                num_bitset_words);
                  }

                  if (is_continue &&
                      old_count != __bitset_count(instr_deps[phi->instr.index],
                                                  num_bitset_words))
                     loop->header_phi_changed = true;
               }
               break;
            }
            default:
               /* NOTE(review): nir_jump_return/halt are rejected here —
                * presumably lowered away before this pass runs; confirm.
                */
               unreachable("unexpected jump type");
            }
            break;

         case nir_instr_type_intrinsic: {
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_store_output:
            case nir_intrinsic_store_per_vertex_output:
            case nir_intrinsic_store_per_primitive_output:
            case nir_intrinsic_store_per_view_output: {
               /* The write mask must be contiguous starting from x. */
               ASSERTED unsigned writemask = nir_intrinsic_write_mask(intr);
               assert(writemask == BITFIELD_MASK(util_bitcount(writemask)));

               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
               assert(sem.num_slots >= 1);

               /* OR this store's dependencies into every slot it covers,
                * allocating the per-slot bitset on first use.
                */
               for (unsigned i = 0; i < sem.num_slots; i++) {
                  unsigned slot = sem.location + i;
                  if (!out_deps[slot]) {
                     out_deps[slot] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                    num_bitset_words);
                  }
                  accum_deps(out_deps[slot], this_instr_deps, num_bitset_words);
               }
               break;
            }

            default:
               break;
            }
            break;
         }

         default:
            break;
         }
      }

      /* Leaving an if (its last else block): propagate the condition stack
       * to phis after the if, then pop the condition.
       */
      if (parent_cf->type == nir_cf_node_if &&
          block == nir_if_last_else_block(nir_cf_node_as_if(parent_cf))) {
         /* Add the current if stack to the phis after the if node because
          * this can happen:
          *
          *    a = load_const true
          *    b = load_const false
          *    if (cond) {
          *    } else {
          *    }
          *    c = phi a, b
          *
          * c depends on cond, but doesn't use any defs from then/else blocks.
          */
         nir_foreach_phi(phi, nir_cf_node_cf_tree_next(parent_cf)) {
            util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
               accum_deps(instr_deps[phi->instr.index],
                          instr_deps[(*cond)->parent_instr->index],
                          num_bitset_words);
            }
         }

         assert(util_dynarray_num_elements(&if_cond_stack, nir_def *));
         (void)util_dynarray_pop_ptr(&if_cond_stack, nir_def *);
      }

      /* Leaving a loop (its last block): if any loop-header phi gained
       * dependencies, walk the loop body again; otherwise pop the loop.
       */
      if (parent_cf->type == nir_cf_node_loop &&
          block == nir_cf_node_cf_tree_last(parent_cf)) {
         assert(util_dynarray_num_elements(&loop_stack, loop_entry));
         loop_entry *loop = get_current_loop(&loop_stack);

         /* Check if any loop header phis would be changed by iterating over
          * the loop again.
          */
         nir_foreach_phi(phi, loop->start_block) {
            unsigned old_count = __bitset_count(instr_deps[phi->instr.index],
                                                num_bitset_words);
            nir_foreach_src(&phi->instr, accum_src_deps,
                            &(foreach_src_data){instr_deps, NULL, num_bitset_words});
            if (old_count != __bitset_count(instr_deps[phi->instr.index],
                                            num_bitset_words)) {
               loop->header_phi_changed = true;
               break;
            }
         }

         if (loop->header_phi_changed) {
            loop->header_phi_changed = false;
            /* Iterate over the loop again: */
            is_loop_first_block = true;
            block = loop->start_block;
            assert(block);
            goto loop_again;
         }

         (void)util_dynarray_pop_ptr(&loop_stack, loop_entry);
      }
   }

   /* Gather instructions that affect each output from bitsets. */
   memset(deps, 0, sizeof(*deps));

   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (!out_deps[i])
         continue;

      unsigned total = __bitset_count(out_deps[i], num_bitset_words);
      unsigned added = 0;
      deps->output[i].num_instr = total;
      /* NOTE(review): malloc result is not checked here. */
      deps->output[i].instr_list = malloc(total * sizeof(nir_instr*));

      /* Expand the bitset into a list ordered by shader position. */
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (BITSET_TEST(out_deps[i], instr->index)) {
               assert(added < total);
               deps->output[i].instr_list[added++] = instr;
            }
         }
      }
      assert(added == total);
   }

   ralloc_free(mem_ctx);
}
void
|
|
nir_free_output_dependencies(nir_output_deps *deps)
|
|
{
|
|
for (unsigned i = 0; i < ARRAY_SIZE(deps->output); i++) {
|
|
assert(!!deps->output[i].instr_list == !!deps->output[i].num_instr);
|
|
if (deps->output[i].instr_list)
|
|
free(deps->output[i].instr_list);
|
|
}
|
|
}
|
|
|
|
static unsigned
|
|
get_slot_index(nir_intrinsic_instr *intr, unsigned slot_offset)
|
|
{
|
|
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
|
|
return (sem.location + slot_offset) * 8 + nir_intrinsic_component(intr) * 2 +
|
|
sem.high_16bits;
|
|
}
|
|
|
|
/* For each output slot, gather which inputs are used to compute it.
 * The shader must be in LCSSA form.
 *
 * If there are memory operations that affect other memory operations, those
 * dependencies are not gathered.
 */
void
|
|
nir_gather_input_to_output_dependencies(nir_shader *nir,
|
|
nir_input_to_output_deps *out_deps)
|
|
{
|
|
nir_output_deps deps;
|
|
nir_gather_output_dependencies(nir, &deps);
|
|
|
|
memset(out_deps, 0, sizeof(*out_deps));
|
|
|
|
for (unsigned out = 0; out < ARRAY_SIZE(deps.output); out++) {
|
|
unsigned num_instr = deps.output[out].num_instr;
|
|
|
|
if (!num_instr)
|
|
continue;
|
|
|
|
out_deps->output[out].defined = true;
|
|
|
|
for (unsigned i = 0; i < num_instr; i++) {
|
|
nir_instr *instr = deps.output[out].instr_list[i];
|
|
if (instr->type == nir_instr_type_tex) {
|
|
if (!nir_tex_instr_is_query(nir_instr_as_tex(instr)))
|
|
out_deps->output[out].uses_image_reads = true;
|
|
}
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
continue;
|
|
|
|
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
switch (intr->intrinsic) {
|
|
case nir_intrinsic_load_input:
|
|
case nir_intrinsic_load_input_vertex:
|
|
case nir_intrinsic_load_per_vertex_input:
|
|
case nir_intrinsic_load_per_primitive_input:
|
|
case nir_intrinsic_load_interpolated_input: {
|
|
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
|
|
assert(intr->def.num_components == 1);
|
|
assert(sem.num_slots >= 1);
|
|
|
|
for (unsigned index = 0; index < sem.num_slots; index++) {
|
|
unsigned slot = get_slot_index(intr, index);
|
|
BITSET_SET(out_deps->output[out].inputs, slot);
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
const char *name = nir_intrinsic_infos[intr->intrinsic].name;
|
|
|
|
if (strstr(name, "load_ssbo") || strstr(name, "ssbo_atomic"))
|
|
out_deps->output[out].uses_ssbo_reads = true;
|
|
|
|
if (strstr(name, "image") &&
|
|
(strstr(name, "load") || strstr(name, "atomic")))
|
|
out_deps->output[out].uses_image_reads = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
nir_free_output_dependencies(&deps);
|
|
}
|
|
|
|
void
|
|
nir_print_input_to_output_deps(nir_input_to_output_deps *deps,
|
|
nir_shader *nir, FILE *f)
|
|
{
|
|
for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
|
|
if (!deps->output[i].defined)
|
|
continue;
|
|
|
|
fprintf(f, "%s(->%s): %s =",
|
|
_mesa_shader_stage_to_abbrev(nir->info.stage),
|
|
nir->info.next_stage != MESA_SHADER_NONE ?
|
|
_mesa_shader_stage_to_abbrev(nir->info.next_stage) :
|
|
"NONE",
|
|
gl_varying_slot_name_for_stage(i, nir->info.stage));
|
|
|
|
unsigned in;
|
|
BITSET_FOREACH_SET(in, deps->output[i].inputs, NUM_TOTAL_VARYING_SLOTS * 8) {
|
|
fprintf(f, " %u.%c%s", in / 8, "xyzw"[(in % 8) / 2], in % 2 ? ".hi" : "");
|
|
}
|
|
fprintf(f, "%s%s",
|
|
deps->output[i].uses_ssbo_reads ? " (ssbo read)" : "",
|
|
deps->output[i].uses_image_reads ? " (image read)" : "");
|
|
fprintf(f, "\n");
|
|
}
|
|
}
|
|
|
|
/* Gather 3 disjoint sets:
 * - the set of input components only used to compute outputs for the clipper
 *   (those that are only used to compute the position and clip outputs)
 * - the set of input components only used to compute all other outputs
 * - the set of input components that are used to compute BOTH outputs for
 *   the clipper and all other outputs
 *
 * If there are memory operations that affect other memory operations, those
 * dependencies are not gathered.
 *
 * The shader must be in LCSSA form.
 *
 * Patch outputs are not gathered because shaders feeding the clipper don't
 * have patch outputs.
 */
void
|
|
nir_gather_output_clipper_var_groups(nir_shader *nir,
|
|
nir_output_clipper_var_groups *groups)
|
|
{
|
|
nir_input_to_output_deps *deps = calloc(1, sizeof(*deps));
|
|
nir_gather_input_to_output_dependencies(nir, deps);
|
|
|
|
uint32_t clipper_outputs = VARYING_BIT_POS |
|
|
VARYING_BIT_CLIP_VERTEX |
|
|
VARYING_BIT_CLIP_DIST0 |
|
|
VARYING_BIT_CLIP_DIST1 |
|
|
VARYING_BIT_CULL_DIST0 |
|
|
VARYING_BIT_CULL_DIST1;
|
|
|
|
/* OR-reduce the per-output sets. */
|
|
memset(groups, 0, sizeof(*groups));
|
|
|
|
u_foreach_bit(i, clipper_outputs) {
|
|
if (deps->output[i].defined) {
|
|
BITSET_OR(groups->pos_only, groups->pos_only,
|
|
deps->output[i].inputs);
|
|
}
|
|
}
|
|
|
|
for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
|
|
if (deps->output[i].defined &&
|
|
(i >= 32 || !(clipper_outputs & BITFIELD_BIT(i)))) {
|
|
BITSET_OR(groups->var_only, groups->var_only,
|
|
deps->output[i].inputs);
|
|
}
|
|
}
|
|
|
|
/* Compute the intersection of the above and make them disjoint. */
|
|
BITSET_AND(groups->both, groups->pos_only, groups->var_only);
|
|
BITSET_ANDNOT(groups->pos_only, groups->pos_only, groups->both);
|
|
BITSET_ANDNOT(groups->var_only, groups->var_only, groups->both);
|
|
free(deps);
|
|
}
|