From f15399af0fef2e2adebccbddbf1f85f63fdccc34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Mon, 18 Nov 2024 16:10:26 -0500
Subject: [PATCH] nir: add passes that gather which inputs affect specific
 outputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first pass computes which shader instructions contribute to each
output. It can be used to query how data flows through a shader towards
its outputs.

The second pass computes which shader input components and which types
of memory loads are used to compute shader outputs.

The third pass uses the second pass to gather which input components
are used to compute the position and clip/cull distance outputs, which
input components are used to compute all other outputs, and which input
components are used to compute both.

This will be used by compaction in nir_opt_varyings for drivers that
split TES into a separate position/cull shader and a varying shader,
to make it less likely that the same vec4 inputs are needed in both.

Reviewed-by: Daniel Schürmann
Part-of:
---
 src/compiler/nir/meson.build              |   1 +
 src/compiler/nir/nir.h                    |  38 ++
 src/compiler/nir/nir_gather_output_deps.c | 499 ++++++++++++++++++++++
 src/util/u_dynarray.h                     |   1 +
 4 files changed, 539 insertions(+)
 create mode 100644 src/compiler/nir/nir_gather_output_deps.c

diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 286c83298a5..8e0595cc323 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -99,6 +99,7 @@ files_libnir = files(
   'nir_from_ssa.c',
   'nir_functions.c',
   'nir_gather_info.c',
+  'nir_gather_output_deps.c',
   'nir_gather_tcs_info.c',
   'nir_gather_types.c',
   'nir_gather_xfb_info.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index b76be907888..f0a4682700f 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6461,6 +6461,44 @@ nir_verts_in_output_prim(nir_shader *gs)
    return mesa_vertices_per_prim(gs->info.gs.output_primitive);
 }
 
+typedef struct {
+   struct {
+      /* The list of instructions that affect this output, including the
+       * output store itself. If NULL, the output isn't stored.
+       */
+      nir_instr **instr_list;
+      unsigned num_instr;
+   } output[NUM_TOTAL_VARYING_SLOTS];
+} nir_output_deps;
+
+void nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps);
+void nir_free_output_dependencies(nir_output_deps *deps);
+
+typedef struct {
+   struct {
+      /* 1 bit per 16-bit input component, i.e. 8 bits per input slot. */
+      BITSET_DECLARE(inputs, NUM_TOTAL_VARYING_SLOTS * 8);
+      bool defined;
+      bool uses_ssbo_reads;
+      bool uses_image_reads;
+   } output[NUM_TOTAL_VARYING_SLOTS];
+} nir_input_to_output_deps;
+
+void nir_gather_input_to_output_dependencies(nir_shader *nir,
+                                             nir_input_to_output_deps *out_deps);
+void nir_print_input_to_output_deps(nir_input_to_output_deps *deps,
+                                    nir_shader *nir, FILE *f);
+
+typedef struct {
+   /* 1 bit per 16-bit component.
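+    * The bit index of a component follows get_slot_index() in
+    * nir_gather_output_deps.c:
+    *    slot * 8 + component * 2 + high_16bits
+    * e.g. the high half of component y of slot 3 is bit 3*8 + 1*2 + 1 = 27.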
+    */
+   BITSET_DECLARE(pos_only, NUM_TOTAL_VARYING_SLOTS * 8);
+   BITSET_DECLARE(var_only, NUM_TOTAL_VARYING_SLOTS * 8);
+   BITSET_DECLARE(both, NUM_TOTAL_VARYING_SLOTS * 8);
+} nir_output_clipper_var_groups;
+
+void nir_gather_output_clipper_var_groups(nir_shader *nir,
+                                          nir_output_clipper_var_groups *groups);
+
 #include "nir_inline_helpers.h"
 
 #ifdef __cplusplus
diff --git a/src/compiler/nir/nir_gather_output_deps.c b/src/compiler/nir/nir_gather_output_deps.c
new file mode 100644
index 00000000000..c3681844015
--- /dev/null
+++ b/src/compiler/nir/nir_gather_output_deps.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright © 2024 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/* For each output slot, gather which input components are used to compute it.
+ * Component-wise ALU instructions must be scalar.
+ */
+
+#include "nir_builder.h"
+#include "util/hash_table.h"
+#include "util/u_dynarray.h"
+#include "util/u_memory.h"
+
+static void
+accum_deps(BITSET_WORD *dst, BITSET_WORD *src, unsigned num_bitset_words)
+{
+   __bitset_or(dst, dst, src, num_bitset_words);
+}
+
+typedef struct {
+   BITSET_WORD **instr_deps;
+   BITSET_WORD *tmp;
+   unsigned num_bitset_words;
+} foreach_src_data;
+
+static bool
+accum_src_deps(nir_src *src, void *opaque)
+{
+   foreach_src_data *data = (foreach_src_data *)opaque;
+   nir_instr *src_instr = src->ssa->parent_instr;
+
+   /* Constants and undefs contribute no dependencies. */
+   if (src_instr->type == nir_instr_type_load_const ||
+       src_instr->type == nir_instr_type_undef)
+      return true;
+
+   nir_instr *dst_instr = nir_src_parent_instr(src);
+   accum_deps(data->instr_deps[dst_instr->index],
+              data->instr_deps[src_instr->index], data->num_bitset_words);
+   return true;
+}
+
+typedef struct {
+   nir_block *start_block; /* the first block of the loop */
+   nir_block *exit_block;  /* the first block after the loop */
+   bool header_phi_changed;
+} loop_entry;
+
+static loop_entry *
+get_current_loop(struct util_dynarray *loop_stack)
+{
+   assert(util_dynarray_num_elements(loop_stack, loop_entry));
+   return util_dynarray_last_ptr(loop_stack, loop_entry);
+}
+
+/* For each output slot, gather which instructions are used to compute it.
+ * The result is that each output slot will have the list of all instructions
+ * that must execute to compute that output.
+ *
+ * If there are memory operations that affect other memory operations, those
+ * dependencies are not gathered.
+ *
+ * Required:
+ * - The shader must be in LCSSA form.
+ *
+ * Recommended:
+ * - IO intrinsics and component-wise ALU instructions should be scalar, and
+ *   vecN opcodes should have their components copy-propagated. If not,
+ *   the results will have false dependencies.
+ *
+ * Algorithm:
+ * - For each instruction, compute a bitset of instruction indices whose
+ *   results are needed to compute the result of the instruction. The final
+ *   bitset is the bit of the instruction's own index OR'd with the bitsets
+ *   of all its sources and of all if-conditions used to enter the block,
+ *   recursively.
+ * - Since every instruction inherits instruction bitsets from its sources,
+ *   every instruction contains the list of all instructions that must execute
+ *   before the instruction can execute.
+ * - At the end, output stores contain the list of instructions that must
+ *   execute to compute their results. This may be any subset of instructions
+ *   from the shader, including all instructions.
+ *
+ * Control flow notes:
+ * - There is a stack of "if" conditions for entered ifs.
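+ *   E.g. while processing the body of "if (a) { if (b) { ... } }", the
+ *   stack holds the conditions a and b.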
+ * - The dependencies of instructions are the union of dependencies of all
+ *   their sources and all if-conditions on the if-condition stack.
+ * - For each continue, all loop-header phis receive the dependencies of all
+ *   if-conditions on the if-condition stack at the continue.
+ * - For each break, all loop-exit phis receive the dependencies of all
+ *   if-conditions on the if-condition stack at the break.
+ * - If there is any change to loop-header phis while iterating over a loop,
+ *   we iterate over the loop again after the current iteration is finished.
+ */
+void
+nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   nir_metadata_require(impl, nir_metadata_instr_index);
+   unsigned num_instr = nir_impl_last_block(impl)->end_ip;
+
+   /* Allocate bitsets of instruction->instruction dependencies. */
+   unsigned num_bitset_words = BITSET_WORDS(num_instr);
+   BITSET_WORD **instr_deps = rzalloc_array(NULL, BITSET_WORD*, num_instr);
+   void *mem_ctx = instr_deps;
+   for (unsigned i = 0; i < num_instr; i++)
+      instr_deps[i] = rzalloc_array(mem_ctx, BITSET_WORD, num_bitset_words);
+
+   /* Allocate bitsets of instruction->output dependencies. */
+   BITSET_WORD **out_deps = rzalloc_array(mem_ctx, BITSET_WORD*,
+                                          NUM_TOTAL_VARYING_SLOTS);
+
+   /* Allocate stacks. */
+   struct util_dynarray loop_stack, if_cond_stack;
+   util_dynarray_init(&loop_stack, mem_ctx);
+   util_dynarray_init(&if_cond_stack, mem_ctx);
+
+   /* Gather dependencies of every instruction.
+    * Dependencies of each instruction are OR'd dependencies of its sources
+    * and control flow conditions.
+    */
+   nir_foreach_block(block, impl) {
+      nir_cf_node *parent_cf = block->cf_node.parent;
+      bool is_loop_first_block = parent_cf->type == nir_cf_node_loop &&
+                                 block == nir_cf_node_cf_tree_first(parent_cf);
+      if (is_loop_first_block) {
+         loop_entry loop = {
+            .start_block = block,
+            .exit_block = nir_cf_node_cf_tree_next(parent_cf),
+         };
+         util_dynarray_append(&loop_stack, loop_entry, loop);
+      }
+
+      if (parent_cf->type == nir_cf_node_if &&
+          block == nir_if_first_then_block(nir_cf_node_as_if(parent_cf))) {
+         util_dynarray_append(&if_cond_stack, nir_def *,
+                              nir_cf_node_as_if(parent_cf)->condition.ssa);
+      }
+
+   loop_again:
+      nir_foreach_instr(instr, block) {
+         /* Add self as a dependency. */
+         BITSET_WORD *this_instr_deps = instr_deps[instr->index];
+         BITSET_SET(this_instr_deps, instr->index);
+
+         /* Add sources as dependencies. */
+         nir_foreach_src(instr, accum_src_deps,
+                         &(foreach_src_data){instr_deps, NULL, num_bitset_words});
+
+         /* Add parent if-conditions as dependencies.
+          *
+          * Note that phis with sources inside conditional blocks don't need
+          * this because the phi sources already contain if-conditions.
+          */
+         util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
+            accum_deps(this_instr_deps,
+                       instr_deps[(*cond)->parent_instr->index],
+                       num_bitset_words);
+         }
+
+         /* Gather the current instruction. */
+         switch (instr->type) {
+         case nir_instr_type_jump:
+            switch (nir_instr_as_jump(instr)->type) {
+            case nir_jump_continue:
+            case nir_jump_break: {
+               loop_entry *loop = get_current_loop(&loop_stack);
+               /* Iterate over all loop-header phis (for continue) or all
+                * loop-exit phis (for break).
+                *
+                * Assumption: Only the loop-start block can have loop-header
+                * phis.
+                */
+               bool is_continue =
+                  nir_instr_as_jump(instr)->type == nir_jump_continue;
+               nir_block *iter_block = is_continue ?
+                  loop->start_block : loop->exit_block;
+               assert(iter_block);
+
+               nir_foreach_phi(phi, iter_block) {
+                  /* We need to track whether any header phi of the current
+                   * loop has changed because we need to walk such loops
+                   * again. Use the bitset bitcount to determine whether
+                   * any instruction has been added to header phis as
+                   * a dependency.
+                   */
+                  unsigned old_count = 0;
+                  if (is_continue) {
+                     old_count = __bitset_count(instr_deps[phi->instr.index],
+                                                num_bitset_words);
+                  }
+
+                  /* Add dependencies of all if-conditions affecting the
+                   * jump statement to phis at the loop header / exit.
+                   */
+                  util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
+                     accum_deps(instr_deps[phi->instr.index],
+                                instr_deps[(*cond)->parent_instr->index],
+                                num_bitset_words);
+                  }
+
+                  if (is_continue &&
+                      old_count != __bitset_count(instr_deps[phi->instr.index],
+                                                  num_bitset_words))
+                     loop->header_phi_changed = true;
+               }
+               break;
+            }
+            default:
+               unreachable("unexpected jump type");
+            }
+            break;
+
+         case nir_instr_type_intrinsic: {
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_store_output:
+            case nir_intrinsic_store_per_vertex_output:
+            case nir_intrinsic_store_per_primitive_output:
+            case nir_intrinsic_store_per_view_output: {
+               /* The write mask must be contiguous starting from x. */
+               ASSERTED unsigned writemask = nir_intrinsic_write_mask(intr);
+               assert(writemask == BITFIELD_MASK(util_bitcount(writemask)));
+
+               nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+               assert(sem.num_slots >= 1);
+
+               for (unsigned i = 0; i < sem.num_slots; i++) {
+                  unsigned slot = sem.location + i;
+                  if (!out_deps[slot]) {
+                     out_deps[slot] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                                    num_bitset_words);
+                  }
+                  accum_deps(out_deps[slot], this_instr_deps, num_bitset_words);
+               }
+               break;
+            }
+
+            default:
+               break;
+            }
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+
+      if (parent_cf->type == nir_cf_node_if &&
+          block == nir_if_last_else_block(nir_cf_node_as_if(parent_cf))) {
+         /* Add the current if stack to the phis after the if node because
+          * this can happen:
+          *
+          *    a = load_const true
+          *    b = load_const false
+          *    if (cond) {
+          *    } else {
+          *    }
+          *    c = phi a, b
+          *
+          * c depends on cond, but doesn't use any defs from then/else blocks.
+          */
+         nir_foreach_phi(phi, nir_cf_node_cf_tree_next(parent_cf)) {
+            util_dynarray_foreach(&if_cond_stack, nir_def *, cond) {
+               accum_deps(instr_deps[phi->instr.index],
+                          instr_deps[(*cond)->parent_instr->index],
+                          num_bitset_words);
+            }
+         }
+
+         assert(util_dynarray_num_elements(&if_cond_stack, nir_def *));
+         (void)util_dynarray_pop_ptr(&if_cond_stack, nir_def *);
+      }
+
+      if (parent_cf->type == nir_cf_node_loop &&
+          block == nir_cf_node_cf_tree_last(parent_cf)) {
+         assert(util_dynarray_num_elements(&loop_stack, loop_entry));
+         loop_entry *loop = get_current_loop(&loop_stack);
+
+         /* Update loop-header phis from their sources and check whether
+          * anything changed; if so, iterate over the loop again.
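+          *
+          * This matters because a header phi's backedge source is only fully
+          * known after the body has been walked once. E.g. in
+          *    loop { b = phi a, d; ... d = fadd b, c; }
+          * the dependencies of d reach b only here, and instructions that
+          * use b pick them up on the next walk.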
+          */
+         nir_foreach_phi(phi, loop->start_block) {
+            unsigned old_count = __bitset_count(instr_deps[phi->instr.index],
+                                                num_bitset_words);
+            nir_foreach_src(&phi->instr, accum_src_deps,
+                            &(foreach_src_data){instr_deps, NULL, num_bitset_words});
+            if (old_count != __bitset_count(instr_deps[phi->instr.index],
+                                            num_bitset_words)) {
+               loop->header_phi_changed = true;
+               break;
+            }
+         }
+
+         if (loop->header_phi_changed) {
+            loop->header_phi_changed = false;
+            /* Iterate over the loop again: */
+            block = loop->start_block;
+            assert(block);
+            goto loop_again;
+         }
+
+         (void)util_dynarray_pop_ptr(&loop_stack, loop_entry);
+      }
+   }
+
+   /* Gather instructions that affect each output from bitsets. */
+   memset(deps, 0, sizeof(*deps));
+
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (!out_deps[i])
+         continue;
+
+      unsigned total = __bitset_count(out_deps[i], num_bitset_words);
+      unsigned added = 0;
+      deps->output[i].num_instr = total;
+      deps->output[i].instr_list = malloc(total * sizeof(nir_instr*));
+
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr(instr, block) {
+            if (BITSET_TEST(out_deps[i], instr->index)) {
+               assert(added < total);
+               deps->output[i].instr_list[added++] = instr;
+            }
+         }
+      }
+      assert(added == total);
+   }
+
+   ralloc_free(mem_ctx);
+}
+
+void
+nir_free_output_dependencies(nir_output_deps *deps)
+{
+   for (unsigned i = 0; i < ARRAY_SIZE(deps->output); i++) {
+      assert(!!deps->output[i].instr_list == !!deps->output[i].num_instr);
+      free(deps->output[i].instr_list);
+   }
+}
+
+static unsigned
+get_slot_index(nir_intrinsic_instr *intr, unsigned slot_offset)
+{
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   return (sem.location + slot_offset) * 8 + nir_intrinsic_component(intr) * 2 +
+          sem.high_16bits;
+}
+
+/* For each output slot, gather which inputs are used to compute it.
+ * The shader must be in LCSSA form.
+ *
+ * If there are memory operations that affect other memory operations, those
+ * dependencies are not gathered.
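+ * E.g. an input that reaches an output only through an SSBO store in one
+ * place and an SSBO load in another is not tracked; the uses_ssbo_reads and
+ * uses_image_reads flags merely tell callers that such memory reads are
+ * involved.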
+ */
+void
+nir_gather_input_to_output_dependencies(nir_shader *nir,
+                                        nir_input_to_output_deps *out_deps)
+{
+   nir_output_deps deps;
+   nir_gather_output_dependencies(nir, &deps);
+
+   memset(out_deps, 0, sizeof(*out_deps));
+
+   for (unsigned out = 0; out < ARRAY_SIZE(deps.output); out++) {
+      unsigned num_instr = deps.output[out].num_instr;
+
+      if (!num_instr)
+         continue;
+
+      out_deps->output[out].defined = true;
+
+      for (unsigned i = 0; i < num_instr; i++) {
+         nir_instr *instr = deps.output[out].instr_list[i];
+
+         /* Texture instructions are their own instruction type, not
+          * intrinsics, so handle them before the switch over intrinsics.
+          * Queries (e.g. texture size) don't read image data.
+          */
+         if (instr->type == nir_instr_type_tex) {
+            if (!nir_tex_instr_is_query(nir_instr_as_tex(instr)))
+               out_deps->output[out].uses_image_reads = true;
+            continue;
+         }
+
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         switch (intr->intrinsic) {
+         case nir_intrinsic_load_input:
+         case nir_intrinsic_load_input_vertex:
+         case nir_intrinsic_load_per_vertex_input:
+         case nir_intrinsic_load_per_primitive_input:
+         case nir_intrinsic_load_interpolated_input: {
+            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+            assert(intr->def.num_components == 1);
+            assert(sem.num_slots >= 1);
+
+            for (unsigned index = 0; index < sem.num_slots; index++) {
+               unsigned slot = get_slot_index(intr, index);
+               BITSET_SET(out_deps->output[out].inputs, slot);
+            }
+            break;
+         }
+         default: {
+            const char *name = nir_intrinsic_infos[intr->intrinsic].name;
+
+            if (strstr(name, "load_ssbo") || strstr(name, "ssbo_atomic"))
+               out_deps->output[out].uses_ssbo_reads = true;
+
+            if (strstr(name, "image") &&
+                (strstr(name, "load") || strstr(name, "atomic")))
+               out_deps->output[out].uses_image_reads = true;
+            break;
+         }
+         }
+      }
+   }
+
+   nir_free_output_dependencies(&deps);
+}
+
+void
+nir_print_input_to_output_deps(nir_input_to_output_deps *deps,
+                               nir_shader *nir, FILE *f)
+{
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (!deps->output[i].defined)
+         continue;
+
+      fprintf(f, "%s(->%s): %s =",
+              _mesa_shader_stage_to_abbrev(nir->info.stage),
+              nir->info.next_stage != MESA_SHADER_NONE ?
+                 _mesa_shader_stage_to_abbrev(nir->info.next_stage) : "NONE",
+              gl_varying_slot_name_for_stage(i, nir->info.stage));
+
+      unsigned in;
+      BITSET_FOREACH_SET(in, deps->output[i].inputs,
+                         NUM_TOTAL_VARYING_SLOTS * 8) {
+         fprintf(f, " %u.%c%s", in / 8, "xyzw"[(in % 8) / 2],
+                 in % 2 ? ".hi" : "");
+      }
+      fprintf(f, "%s%s",
+              deps->output[i].uses_ssbo_reads ? " (ssbo read)" : "",
+              deps->output[i].uses_image_reads ? " (image read)" : "");
+      fprintf(f, "\n");
+   }
+}
+
+/* Gather 3 disjoint sets:
+ * - the set of input components only used to compute outputs for the clipper
+ *   (those that are only used to compute the position and clip outputs)
+ * - the set of input components only used to compute all other outputs
+ * - the set of input components that are used to compute BOTH outputs for
+ *   the clipper and all other outputs
+ *
+ * If there are memory operations that affect other memory operations, those
+ * dependencies are not gathered.
+ *
+ * The shader must be in LCSSA form.
+ *
+ * Patch outputs are not gathered because shaders feeding the clipper don't
+ * have patch outputs.
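+ *
+ * Illustrative example: if the position is computed from inputs A and B
+ * while a color output is computed from inputs B and C, the result is
+ * pos_only = {A}, var_only = {C}, both = {B}.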
+ */
+void
+nir_gather_output_clipper_var_groups(nir_shader *nir,
+                                     nir_output_clipper_var_groups *groups)
+{
+   nir_input_to_output_deps *deps = calloc(1, sizeof(*deps));
+   nir_gather_input_to_output_dependencies(nir, deps);
+
+   uint32_t clipper_outputs = VARYING_BIT_POS |
+                              VARYING_BIT_CLIP_VERTEX |
+                              VARYING_BIT_CLIP_DIST0 |
+                              VARYING_BIT_CLIP_DIST1 |
+                              VARYING_BIT_CULL_DIST0 |
+                              VARYING_BIT_CULL_DIST1;
+
+   /* OR-reduce the per-output sets. */
+   memset(groups, 0, sizeof(*groups));
+
+   u_foreach_bit(i, clipper_outputs) {
+      if (deps->output[i].defined) {
+         BITSET_OR(groups->pos_only, groups->pos_only,
+                   deps->output[i].inputs);
+      }
+   }
+
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (deps->output[i].defined &&
+          (i >= 32 || !(clipper_outputs & BITFIELD_BIT(i)))) {
+         BITSET_OR(groups->var_only, groups->var_only,
+                   deps->output[i].inputs);
+      }
+   }
+
+   /* Compute the intersection of the above and make them disjoint. */
+   BITSET_AND(groups->both, groups->pos_only, groups->var_only);
+   BITSET_ANDNOT(groups->pos_only, groups->pos_only, groups->both);
+   BITSET_ANDNOT(groups->var_only, groups->var_only, groups->both);
+   free(deps);
+}
diff --git a/src/util/u_dynarray.h b/src/util/u_dynarray.h
index 6f50dcc929b..5b95d7da773 100644
--- a/src/util/u_dynarray.h
+++ b/src/util/u_dynarray.h
@@ -221,6 +221,7 @@ util_dynarray_append_dynarray(struct util_dynarray *buf,
 #define util_dynarray_top(buf, type) *util_dynarray_top_ptr(buf, type)
 #define util_dynarray_pop_ptr(buf, type) ((type*)((char*)(buf)->data + ((buf)->size -= sizeof(type))))
 #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
+#define util_dynarray_last_ptr(buf, type) ((type*)((char*)(buf)->data + ((buf)->size - sizeof(type))))
 #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
 #define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx))
 #define util_dynarray_begin(buf) ((buf)->data)
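
Usage sketch (illustrative, not part of the patch): a minimal example of how
a driver heuristic might consume the third pass, assuming "shader" is a
scalarized TES in LCSSA form as the passes require; the helper name and the
threshold are hypothetical.

#include "nir.h"

/* Hypothetical driver-side heuristic: splitting the TES into a position/cull
 * shader and a varying shader is cheap when few input components are needed
 * by both halves.
 */
static bool
split_tes_seems_profitable(nir_shader *shader)
{
   nir_output_clipper_var_groups groups;
   nir_gather_output_clipper_var_groups(shader, &groups);

   /* Count the 16-bit input components needed by both halves. */
   unsigned shared = 0, i;
   BITSET_FOREACH_SET(i, groups.both, NUM_TOTAL_VARYING_SLOTS * 8)
      shared++;

   /* Arbitrary illustrative threshold: at most one shared vec4 slot. */
   return shared <= 8;
}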