diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 286c83298a5..8e0595cc323 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -99,6 +99,7 @@ files_libnir = files( 'nir_from_ssa.c', 'nir_functions.c', 'nir_gather_info.c', + 'nir_gather_output_deps.c', 'nir_gather_tcs_info.c', 'nir_gather_types.c', 'nir_gather_xfb_info.c', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index b76be907888..f0a4682700f 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -6461,6 +6461,44 @@ nir_verts_in_output_prim(nir_shader *gs) return mesa_vertices_per_prim(gs->info.gs.output_primitive); } +typedef struct { + struct { + /* The list of instructions that affect this output including the output + * store itself. If NULL, the output isn't stored. + */ + nir_instr **instr_list; + unsigned num_instr; + } output[NUM_TOTAL_VARYING_SLOTS]; +} nir_output_deps; + +void nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps); +void nir_free_output_dependencies(nir_output_deps *deps); + +typedef struct { + struct { + /* Per component mask of input slots. */ + BITSET_DECLARE(inputs, NUM_TOTAL_VARYING_SLOTS * 8); + bool defined; + bool uses_ssbo_reads; + bool uses_image_reads; + } output[NUM_TOTAL_VARYING_SLOTS]; +} nir_input_to_output_deps; + +void nir_gather_input_to_output_dependencies(nir_shader *nir, + nir_input_to_output_deps *out_deps); +void nir_print_input_to_output_deps(nir_input_to_output_deps *deps, + nir_shader *nir, FILE *f); + +typedef struct { + /* 1 bit per 16-bit component. 
*/ + BITSET_DECLARE(pos_only, NUM_TOTAL_VARYING_SLOTS * 8); + BITSET_DECLARE(var_only, NUM_TOTAL_VARYING_SLOTS * 8); + BITSET_DECLARE(both, NUM_TOTAL_VARYING_SLOTS * 8); +} nir_output_clipper_var_groups; + +void nir_gather_output_clipper_var_groups(nir_shader *nir, + nir_output_clipper_var_groups *groups); + #include "nir_inline_helpers.h" #ifdef __cplusplus diff --git a/src/compiler/nir/nir_gather_output_deps.c b/src/compiler/nir/nir_gather_output_deps.c new file mode 100644 index 00000000000..c3681844015 --- /dev/null +++ b/src/compiler/nir/nir_gather_output_deps.c @@ -0,0 +1,499 @@ +/* + * Copyright © 2024 Advanced Micro Devices, Inc. + * + * SPDX-License-Identifier: MIT + */ + +/* For each output slot, gather which input components are used to compute it. + * Component-wise ALU instructions must be scalar. + */ + +#include "nir_builder.h" +#include "util/hash_table.h" +#include "util/u_dynarray.h" +#include "util/u_memory.h" + +static void +accum_deps(BITSET_WORD *dst, BITSET_WORD *src, unsigned num_bitset_words) +{ + __bitset_or(dst, dst, src, num_bitset_words); +} + +typedef struct { + BITSET_WORD **instr_deps; + BITSET_WORD *tmp; + unsigned num_bitset_words; +} foreach_src_data; + +static bool +accum_src_deps(nir_src *src, void *opaque) +{ + foreach_src_data *data = (foreach_src_data *)opaque; + nir_instr *src_instr = src->ssa->parent_instr; + + if (src_instr->type == nir_instr_type_load_const || + src_instr->type == nir_instr_type_undef) + return true; + + nir_instr *dst_instr = nir_src_parent_instr(src); + accum_deps(data->instr_deps[dst_instr->index], + data->instr_deps[src_instr->index], data->num_bitset_words); + return true; +} + +typedef struct { + nir_block *start_block; /* the first block of the loop */ + nir_block *exit_block; /* the first block after the loop */ + bool header_phi_changed; +} loop_entry; + +static loop_entry * +get_current_loop(struct util_dynarray *loop_stack) +{ + assert(util_dynarray_num_elements(loop_stack, loop_entry)); + 
return util_dynarray_last_ptr(loop_stack, loop_entry); +} + +/* For each output slot, gather which instructions are used to compute it. + * The result is that each output slot will have the list of all instructions + * that must execute to compute that output. + * + * If there are memory operations that affect other memory operations, those + * dependencies are not gathered. + * + * Required: + * - The shader must be in LCSSA form. + * + * Recommended: + * - IO intrinsics and component-wise ALU instructions should be scalar, and + * vecN opcodes should have their components copy-propagated. If not, + * the results will have false dependencies. + * + * Algorithm: + * - For each instruction, compute a bitset of instruction indices whose + * results are needed to compute the result of the instruction. The final + * bitset is the instruction index OR'd with bitsets of all its sources and + * also all if-conditions used to enter the block, recursively. + * - Since every instruction inherits instruction bitsets from its sources, + * every instruction contains the list of all instructions that must execute + * before the instruction can execute. + * - At the end, output stores contain the list of instructions that must + * execute to compute their results. This may be any subset of instructions + * from the shader, including all instructions. + * + * Control flow notes: + * - There is a stack of "if" conditions for entered ifs. + * - The dependencies of instructions are the union of dependencies of all + * their sources and all if conditions on the if-condition stack. + * - For each continue, all loop-header phis receive the dependencies of all + * if-conditions on the if-condition stack at the continue. + * - For each break, all loop-exit phis receive the dependencies of all + * if-conditions on the if-condition stack at the break. 
+ * - If there is any change to loop-header phis while iterating over a loop, + * we iterate over the loop again after the current iteration is finished. + */ +void +nir_gather_output_dependencies(nir_shader *nir, nir_output_deps *deps) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_metadata_require(impl, nir_metadata_instr_index); + unsigned num_instr = nir_impl_last_block(impl)->end_ip; + + /* Allocate bitsets of instruction->instruction dependencies. */ + unsigned num_bitset_words = BITSET_WORDS(num_instr); + BITSET_WORD **instr_deps = rzalloc_array(NULL, BITSET_WORD*, num_instr); + void *mem_ctx = instr_deps; + for (unsigned i = 0; i < num_instr; i++) + instr_deps[i] = rzalloc_array(mem_ctx, BITSET_WORD, num_bitset_words); + + /* Allocate bitsets of instruction->output dependencies. */ + BITSET_WORD **out_deps = rzalloc_array(mem_ctx, BITSET_WORD*, + NUM_TOTAL_VARYING_SLOTS); + + /* Allocate stacks. */ + struct util_dynarray loop_stack, if_cond_stack; + util_dynarray_init(&loop_stack, mem_ctx); + util_dynarray_init(&if_cond_stack, mem_ctx); + + /* Gather dependencies of every instruction. + * Dependencies of each instruction are OR'd dependencies of its sources and + * control flow conditions. + */ + nir_foreach_block(block, impl) { + nir_cf_node *parent_cf = block->cf_node.parent; + bool is_loop_first_block = parent_cf->type == nir_cf_node_loop && + block == nir_cf_node_cf_tree_first(parent_cf); + if (is_loop_first_block) { + loop_entry loop = { + .start_block = block, + .exit_block = nir_cf_node_cf_tree_next(parent_cf), + }; + util_dynarray_append(&loop_stack, loop_entry, loop); + } + + if (parent_cf->type == nir_cf_node_if && + block == nir_if_first_then_block(nir_cf_node_as_if(parent_cf))) { + util_dynarray_append(&if_cond_stack, nir_def *, + nir_cf_node_as_if(parent_cf)->condition.ssa); + } + + loop_again: + nir_foreach_instr(instr, block) { + /* Add self as a dependency. 
*/ + BITSET_WORD *this_instr_deps = instr_deps[instr->index]; + BITSET_SET(this_instr_deps, instr->index); + + /* Add sources as dependencies. */ + nir_foreach_src(instr, accum_src_deps, + &(foreach_src_data){instr_deps, NULL, num_bitset_words}); + + /* Add parent if-conditions as dependencies. + * + * Note that phis with sources inside conditional blocks don't need + * this because the phi sources already contain if-conditions. + */ + util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { + accum_deps(this_instr_deps, + instr_deps[(*cond)->parent_instr->index], + num_bitset_words); + } + + /* Gather the current instruction. */ + switch (instr->type) { + case nir_instr_type_jump: + switch (nir_instr_as_jump(instr)->type) { + case nir_jump_continue: + case nir_jump_break: { + loop_entry *loop = get_current_loop(&loop_stack); + /* Iterate over all loop-header phis (for continue) or all + * loop-exit phis (for break). + * + * Assumption: Only the loop-start block can have loop-header + * phis. + */ + bool is_continue = + nir_instr_as_jump(instr)->type == nir_jump_continue; + nir_block *iter_block = + is_continue ? loop->start_block : loop->exit_block; + assert(iter_block); + + nir_foreach_phi(phi, iter_block) { + /* We need to track whether any header phi of the current + * loop has changed because we need to walk such loops + * again. Use the bitset bitcount to determine whether + * any instruction has been added to header phis as + * a dependency. + */ + unsigned old_count = 0; + if (is_continue) { + old_count = __bitset_count(instr_deps[phi->instr.index], + num_bitset_words); + } + + /* Add dependencies of all if-conditions affecting the + * jump statement to phis at the loop header / exit. 
+ */ + util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { + accum_deps(instr_deps[phi->instr.index], + instr_deps[(*cond)->parent_instr->index], + num_bitset_words); + } + + if (is_continue && + old_count != __bitset_count(instr_deps[phi->instr.index], + num_bitset_words)) + loop->header_phi_changed = true; + } + break; + } + default: + unreachable("unexpected jump type"); + } + break; + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_view_output: { + /* The write mask must be contiguous starting from x. */ + ASSERTED unsigned writemask = nir_intrinsic_write_mask(intr); + assert(writemask == BITFIELD_MASK(util_bitcount(writemask))); + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + assert(sem.num_slots >= 1); + + for (unsigned i = 0; i < sem.num_slots; i++) { + unsigned slot = sem.location + i; + if (!out_deps[slot]) { + out_deps[slot] = rzalloc_array(mem_ctx, BITSET_WORD, + num_bitset_words); + } + accum_deps(out_deps[slot], this_instr_deps, num_bitset_words); + } + break; + } + + default: + break; + } + break; + } + + default: + break; + } + } + + if (parent_cf->type == nir_cf_node_if && + block == nir_if_last_else_block(nir_cf_node_as_if(parent_cf))) { + /* Add the current if stack to the phis after the if node because + * this can happen: + * + * a = load_const true + * b = load_const false + * if (cond) { + * } else { + * } + * c = phi a, b + * + * c depends on cond, but doesn't use any defs from then/else blocks. 
+ */ + nir_foreach_phi(phi, nir_cf_node_cf_tree_next(parent_cf)) { + util_dynarray_foreach(&if_cond_stack, nir_def *, cond) { + accum_deps(instr_deps[phi->instr.index], + instr_deps[(*cond)->parent_instr->index], + num_bitset_words); + } + } + + assert(util_dynarray_num_elements(&if_cond_stack, nir_def *)); + (void)util_dynarray_pop_ptr(&if_cond_stack, nir_def *); + } + + if (parent_cf->type == nir_cf_node_loop && + block == nir_cf_node_cf_tree_last(parent_cf)) { + assert(util_dynarray_num_elements(&loop_stack, loop_entry)); + loop_entry *loop = get_current_loop(&loop_stack); + + /* Check if any loop header phis would be changed by iterating over + * the loop again. + */ + nir_foreach_phi(phi, loop->start_block) { + unsigned old_count = __bitset_count(instr_deps[phi->instr.index], + num_bitset_words); + nir_foreach_src(&phi->instr, accum_src_deps, + &(foreach_src_data){instr_deps, NULL, num_bitset_words}); + if (old_count != __bitset_count(instr_deps[phi->instr.index], + num_bitset_words)) { + loop->header_phi_changed = true; + break; + } + } + + if (loop->header_phi_changed) { + loop->header_phi_changed = false; + /* Iterate over the loop again: */ + is_loop_first_block = true; + block = loop->start_block; + assert(block); + goto loop_again; + } + + (void)util_dynarray_pop_ptr(&loop_stack, loop_entry); + } + } + + /* Gather instructions that affect each output from bitsets. 
*/ + memset(deps, 0, sizeof(*deps)); + + for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { + if (!out_deps[i]) + continue; + + unsigned total = __bitset_count(out_deps[i], num_bitset_words); + unsigned added = 0; + deps->output[i].num_instr = total; + deps->output[i].instr_list = malloc(total * sizeof(nir_instr*)); + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (BITSET_TEST(out_deps[i], instr->index)) { + assert(added < total); + deps->output[i].instr_list[added++] = instr; + } + } + } + assert(added == total); + } + + ralloc_free(mem_ctx); +} + +void +nir_free_output_dependencies(nir_output_deps *deps) +{ + for (unsigned i = 0; i < ARRAY_SIZE(deps->output); i++) { + assert(!!deps->output[i].instr_list == !!deps->output[i].num_instr); + if (deps->output[i].instr_list) + free(deps->output[i].instr_list); + } +} + +static unsigned +get_slot_index(nir_intrinsic_instr *intr, unsigned slot_offset) +{ + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + return (sem.location + slot_offset) * 8 + nir_intrinsic_component(intr) * 2 + + sem.high_16bits; +} + +/* For each output slot, gather which inputs are used to compute it. + * The shader must be in LCSSA form. + * + * If there are memory operations that affect other memory operations, those + * dependencies are not gathered. 
+ */
+void
+nir_gather_input_to_output_dependencies(nir_shader *nir,
+                                        nir_input_to_output_deps *out_deps)
+{
+   nir_output_deps deps;
+   nir_gather_output_dependencies(nir, &deps);
+
+   memset(out_deps, 0, sizeof(*out_deps));
+
+   for (unsigned out = 0; out < ARRAY_SIZE(deps.output); out++) {
+      unsigned num_instr = deps.output[out].num_instr;
+
+      /* No instruction list means the output isn't stored. */
+      if (!num_instr)
+         continue;
+
+      out_deps->output[out].defined = true;
+
+      for (unsigned i = 0; i < num_instr; i++) {
+         nir_instr *instr = deps.output[out].instr_list[i];
+
+         /* Texture instructions are not intrinsics, so they must be
+          * recognized by instruction type before the intrinsic filter
+          * below. (nir_instr_type_tex is an nir_instr_type enumerator
+          * and must not appear as a case label in a switch over
+          * nir_intrinsic_op — doing so both drops all tex instructions,
+          * which never pass the type filter, and mis-handles whichever
+          * intrinsic op shares its numeric value.)
+          */
+         if (instr->type == nir_instr_type_tex) {
+            /* Queries (e.g. size queries) don't read image data. */
+            if (!nir_tex_instr_is_query(nir_instr_as_tex(instr)))
+               out_deps->output[out].uses_image_reads = true;
+            continue;
+         }
+
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         switch (intr->intrinsic) {
+         case nir_intrinsic_load_input:
+         case nir_intrinsic_load_input_vertex:
+         case nir_intrinsic_load_per_vertex_input:
+         case nir_intrinsic_load_per_primitive_input:
+         case nir_intrinsic_load_interpolated_input: {
+            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+            /* IO must be scalar (see the gatherer's requirements). */
+            assert(intr->def.num_components == 1);
+            assert(sem.num_slots >= 1);
+
+            /* Mark every 16-bit component slot this load covers. */
+            for (unsigned index = 0; index < sem.num_slots; index++) {
+               unsigned slot = get_slot_index(intr, index);
+               BITSET_SET(out_deps->output[out].inputs, slot);
+            }
+            break;
+         }
+         default: {
+            /* Classify memory reads by intrinsic name rather than
+             * enumerating every SSBO/image variant.
+             */
+            const char *name = nir_intrinsic_infos[intr->intrinsic].name;
+
+            if (strstr(name, "load_ssbo") || strstr(name, "ssbo_atomic"))
+               out_deps->output[out].uses_ssbo_reads = true;
+
+            if (strstr(name, "image") &&
+                (strstr(name, "load") || strstr(name, "atomic")))
+               out_deps->output[out].uses_image_reads = true;
+            break;
+         }
+         }
+      }
+   }
+
+   nir_free_output_dependencies(&deps);
+}
+
+/* Print the gathered input->output dependencies, one line per defined
+ * output slot, in the form "STAGE(->NEXT): OUT = in.comp[.hi] ...".
+ */
+void
+nir_print_input_to_output_deps(nir_input_to_output_deps *deps,
+                               nir_shader *nir, FILE *f)
+{
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (!deps->output[i].defined)
+         continue;
+
+      fprintf(f, "%s(->%s): %s =",
+
_mesa_shader_stage_to_abbrev(nir->info.stage), + nir->info.next_stage != MESA_SHADER_NONE ? + _mesa_shader_stage_to_abbrev(nir->info.next_stage) : + "NONE", + gl_varying_slot_name_for_stage(i, nir->info.stage)); + + unsigned in; + BITSET_FOREACH_SET(in, deps->output[i].inputs, NUM_TOTAL_VARYING_SLOTS * 8) { + fprintf(f, " %u.%c%s", in / 8, "xyzw"[(in % 8) / 2], in % 2 ? ".hi" : ""); + } + fprintf(f, "%s%s", + deps->output[i].uses_ssbo_reads ? " (ssbo read)" : "", + deps->output[i].uses_image_reads ? " (image read)" : ""); + fprintf(f, "\n"); + } +} + +/* Gather 3 disjoint sets: + * - the set of input components only used to compute outputs for the clipper + * (those that are only used to compute the position and clip outputs) + * - the set of input components only used to compute all other outputs + * - the set of input components that are used to compute BOTH outputs for + * the clipper and all other outputs + * + * If there are memory operations that affect other memory operations, those + * dependencies are not gathered. + * + * The shader must be in LCSSA form. + * + * Patch outputs are not gathered because shaders feeding the clipper don't + * have patch outputs. + */ +void +nir_gather_output_clipper_var_groups(nir_shader *nir, + nir_output_clipper_var_groups *groups) +{ + nir_input_to_output_deps *deps = calloc(1, sizeof(*deps)); + nir_gather_input_to_output_dependencies(nir, deps); + + uint32_t clipper_outputs = VARYING_BIT_POS | + VARYING_BIT_CLIP_VERTEX | + VARYING_BIT_CLIP_DIST0 | + VARYING_BIT_CLIP_DIST1 | + VARYING_BIT_CULL_DIST0 | + VARYING_BIT_CULL_DIST1; + + /* OR-reduce the per-output sets. 
*/ + memset(groups, 0, sizeof(*groups)); + + u_foreach_bit(i, clipper_outputs) { + if (deps->output[i].defined) { + BITSET_OR(groups->pos_only, groups->pos_only, + deps->output[i].inputs); + } + } + + for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) { + if (deps->output[i].defined && + (i >= 32 || !(clipper_outputs & BITFIELD_BIT(i)))) { + BITSET_OR(groups->var_only, groups->var_only, + deps->output[i].inputs); + } + } + + /* Compute the intersection of the above and make them disjoint. */ + BITSET_AND(groups->both, groups->pos_only, groups->var_only); + BITSET_ANDNOT(groups->pos_only, groups->pos_only, groups->both); + BITSET_ANDNOT(groups->var_only, groups->var_only, groups->both); + free(deps); +} diff --git a/src/util/u_dynarray.h b/src/util/u_dynarray.h index 6f50dcc929b..5b95d7da773 100644 --- a/src/util/u_dynarray.h +++ b/src/util/u_dynarray.h @@ -221,6 +221,7 @@ util_dynarray_append_dynarray(struct util_dynarray *buf, #define util_dynarray_top(buf, type) *util_dynarray_top_ptr(buf, type) #define util_dynarray_pop_ptr(buf, type) ((type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))) #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type) +#define util_dynarray_last_ptr(buf, type) ((type*)((char*)(buf)->data + ((buf)->size - sizeof(type)))) #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type)) #define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) #define util_dynarray_begin(buf) ((buf)->data)