From fccc35c2def2293b7adb313265b62d4aa198ff9e Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 24 Sep 2021 19:04:04 +0200 Subject: [PATCH] ir3: Add preamble optimization pass Now that everything is plumbed through, we can tie it together. Part-of: --- src/freedreno/ir3/ir3_compiler.c | 3 + src/freedreno/ir3/ir3_compiler.h | 4 + src/freedreno/ir3/ir3_nir.c | 20 +- src/freedreno/ir3/ir3_nir.h | 2 + .../ir3/ir3_nir_analyze_ubo_ranges.c | 4 +- src/freedreno/ir3/ir3_nir_opt_preamble.c | 420 ++++++++++++++++++ src/freedreno/ir3/ir3_shader.h | 3 + src/freedreno/ir3/meson.build | 1 + 8 files changed, 455 insertions(+), 2 deletions(-) create mode 100644 src/freedreno/ir3/ir3_nir_opt_preamble.c diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 3c9e0db7d57..0d10ae8f23e 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -45,6 +45,7 @@ static const struct debug_named_value shader_debug_options[] = { {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, + {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"}, #ifdef DEBUG /* DEBUG-only options: */ {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, @@ -245,6 +246,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, /* TODO: implement private memory on earlier gen's */ compiler->has_pvtmem = true; + compiler->has_preamble = true; + compiler->tess_use_shared = dev_info->a6xx.tess_use_shared; compiler->storage_16bit = dev_info->a6xx.storage_16bit; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index ddbd152ea7c..1521d58c8cf 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -182,6 +182,9 @@ struct ir3_compiler { * constbuf. a5xx+ has the shared regfile. */ bool has_shared_regfile; + + /* True if preamble instructions (shps, shpe, etc.) are supported */ + bool has_preamble; }; void ir3_compiler_destroy(struct ir3_compiler *compiler); @@ -224,6 +227,7 @@ enum ir3_shader_debug { IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), IR3_DBG_SPILLALL = BITFIELD_BIT(12), + IR3_DBG_NOPREAMBLE = BITFIELD_BIT(13), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 207a327f94f..96529f35727 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -640,11 +640,28 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) progress |= OPT(s, ir3_nir_lower_64b_undef); progress |= OPT(s, nir_lower_int64); + /* Cleanup code leftover from lowering passes before opt_preamble */ + if (progress) { + progress |= OPT(s, nir_opt_constant_folding); + } + + /* Do the preamble before analysing UBO ranges, because it's usually + * higher-value and because it can result in eliminating some indirect UBO + * accesses where otherwise we'd have to push the whole range. However we + * have to lower the preamble after UBO lowering so that UBO lowering can + * insert instructions in the preamble to push UBOs. 
+ */ + if (so->shader->compiler->has_preamble && + !(ir3_shader_debug & IR3_DBG_NOPREAMBLE)) + progress |= OPT(s, ir3_nir_opt_preamble, so); + if (!so->binning_pass) OPT_V(s, ir3_nir_analyze_ubo_ranges, so); progress |= OPT(s, ir3_nir_lower_ubo_loads, so); + progress |= OPT(s, ir3_nir_lower_preamble, so); + OPT_V(s, nir_lower_amul, ir3_glsl_type_size); /* UBO offset lowering has to come after we've decided what will @@ -826,7 +843,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, debug_assert((const_state->ubo_state.size % 16) == 0); unsigned constoff = v->shader->num_reserved_user_consts + - const_state->ubo_state.size / 16; + const_state->ubo_state.size / 16 + + const_state->preamble_size; unsigned ptrsz = ir3_pointer_size(compiler); if (const_state->num_ubos > 0) { diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 0ad5f09766c..7a780191d32 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -73,6 +73,8 @@ bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v); void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_fixup_load_uniform(nir_shader *nir); +bool ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v); +bool ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v); nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 5ee39737016..dea75e95c9e 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -369,7 +369,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v) * allocation of the driver params' const space, because UBO pointers can * be driver params but this pass usually eliminatings them. */ - struct ir3_const_state worst_case_const_state = {}; + struct ir3_const_state worst_case_const_state = { + .preamble_size = const_state->preamble_size, + }; ir3_setup_const_state(nir, v, &worst_case_const_state); const uint32_t max_upload = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16; diff --git a/src/freedreno/ir3/ir3_nir_opt_preamble.c b/src/freedreno/ir3/ir3_nir_opt_preamble.c new file mode 100644 index 00000000000..7c5c60c78d9 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c @@ -0,0 +1,420 @@ +/* + * Copyright © 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ir3_compiler.h" +#include "ir3_nir.h" + +/* Preamble optimization happens in two parts: first we generate the preamble + * using the generic NIR pass, then we setup the preamble sequence and inline + * the preamble into the main shader if there was a preamble. The first part + * should happen before UBO lowering, because we want to prefer more complex + * expressions over UBO loads, but the second part has to happen after UBO + * lowering because it may add copy instructions to the preamble. + */ + +static void +def_size(nir_ssa_def *def, unsigned *size, unsigned *align) +{ + unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size; + /* Due to the implicit const file promotion we want to expand 16-bit values + * to 32-bit so that the truncation in the main shader can hopefully be + * folded into the use. + */ + *size = DIV_ROUND_UP(bit_size, 32) * def->num_components; + *align = 1; +} + +static bool +all_uses_float(nir_ssa_def *def, bool allow_src2) +{ + nir_foreach_if_use (use, def) { + return false; + } + + nir_foreach_use (use, def) { + nir_instr *use_instr = use->parent_instr; + if (use_instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *use_alu = nir_instr_as_alu(use_instr); + unsigned src_index = ~0; + for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) { + if (&use_alu->src[i].src == use) { + src_index = i; + break; + } + } + + assert(src_index != ~0); + nir_alu_type src_type = + nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]); + + if (src_type != nir_type_float || (src_index == 2 && !allow_src2)) + return false; + } + + return true; +} + +static bool +all_uses_bit(nir_ssa_def *def) +{ + nir_foreach_if_use (use, def) { + return false; + } + + nir_foreach_use (use, def) { + nir_instr *use_instr = use->parent_instr; + if (use_instr->type != nir_instr_type_alu) + return false; + nir_alu_instr *use_alu = nir_instr_as_alu(use_instr); + + /* See ir3_cat2_absneg() */ + switch (use_alu->op) { + case nir_op_iand: + case nir_op_ior: + case nir_op_inot: + case nir_op_ixor: + case nir_op_bitfield_reverse: + case nir_op_ufind_msb: + case nir_op_ifind_msb: + case nir_op_find_lsb: + case nir_op_ishl: + case nir_op_ushr: + case nir_op_ishr: + case nir_op_bit_count: + continue; + default: + return false; + } + } + + return true; +} + +static float +instr_cost(nir_instr *instr, const void *data) +{ + /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops + * take 1 (normalized) cycle. + * + * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP + * + * TODO: assume wave128 on fragment/compute shaders? + */ + + switch (instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(instr); + unsigned components = alu->dest.dest.ssa.num_components; + switch (alu->op) { + /* cat4 */ + case nir_op_frcp: + case nir_op_fsqrt: + case nir_op_frsq: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsin: + case nir_op_fcos: + return 4 * components; + + /* Instructions that become src modifiers. Note for conversions this is + * really an approximation. + * + * This prevents silly things like lifting a negate that would become a + * modifier. 
+ */ + case nir_op_f2f32: + case nir_op_f2f16: + case nir_op_f2fmp: + case nir_op_fneg: + return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components; + + case nir_op_fabs: + return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components; + + case nir_op_inot: + return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components; + + /* Instructions that become vector split/collect */ + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: + case nir_op_mov: + return 0; + + /* cat1-cat3 */ + default: + return 1 * components; + } + break; + } + + case nir_instr_type_tex: + /* cat5 */ + return 8; + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: { + /* If the UBO and offset are constant, then UBO lowering should do a + * better job trying to lower this, and opt_preamble shouldn't try to + * duplicate it. However if it has a non-constant offset then we can + * avoid setting up a0.x etc. in the main shader and potentially have + * to push less. + */ + bool const_ubo = nir_src_is_const(intrin->src[0]); + if (!const_ubo) { + nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]); + if (rsrc) + const_ubo = nir_src_is_const(rsrc->src[0]); + } + + if (const_ubo && nir_src_is_const(intrin->src[1])) + return 0; + + /* TODO: get actual numbers for ldc */ + return 8; + } + + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ssbo_ir3: + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_image_load: + case nir_intrinsic_bindless_image_load: + /* cat5/isam */ + return 8; + + /* By default assume it's a sysval or something */ + default: + return 0; + } + } + + default: + return 0; + } +} + +static float +rewrite_cost(nir_ssa_def *def, const void *data) +{ + /* We always have to expand booleans */ + if (def->bit_size == 1) + return def->num_components; + + bool mov_needed = false; + nir_foreach_use (use, def) { + nir_instr *parent_instr = use->parent_instr; + if (parent_instr->type != nir_instr_type_alu) { + mov_needed = true; + break; + } else { + nir_alu_instr *alu = nir_instr_as_alu(parent_instr); + if (alu->op == nir_op_vec2 || + alu->op == nir_op_vec3 || + alu->op == nir_op_vec4 || + alu->op == nir_op_mov) { + mov_needed = true; + break; + } else { + /* Assume for non-moves that the const is folded into the src */ + } + } + } + + return mov_needed ? 
def->num_components : 0; +} + +static bool +avoid_instr(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3; +} + +bool +ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v) +{ + struct ir3_const_state *const_state = ir3_const_state(v); + + unsigned max_size; + if (v->binning_pass) { + max_size = const_state->preamble_size * 4; + } else { + struct ir3_const_state worst_case_const_state = {}; + ir3_setup_const_state(nir, v, &worst_case_const_state); + max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4; + } + + if (max_size == 0) + return false; + + nir_opt_preamble_options options = { + .drawid_uniform = true, + .subgroup_size_uniform = true, + .def_size = def_size, + .preamble_storage_size = max_size, + .instr_cost_cb = instr_cost, + .avoid_instr_cb = avoid_instr, + .rewrite_cost_cb = rewrite_cost, + }; + + unsigned size; + bool progress = nir_opt_preamble(nir, &options, &size); + + if (!v->binning_pass) + const_state->preamble_size = DIV_ROUND_UP(size, 4); + + return progress; +} + +bool +ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v) +{ + nir_function_impl *main = nir_shader_get_entrypoint(nir); + + if (!main->preamble) + return false; + + nir_function_impl *preamble = main->preamble->impl; + + /* First, lower load/store_preamble. */ + const struct ir3_const_state *const_state = ir3_const_state(v); + unsigned preamble_base = v->shader->num_reserved_user_consts * 4 + + const_state->ubo_state.size / 4; + unsigned preamble_size = const_state->preamble_size * 4; + + BITSET_DECLARE(promoted_to_float, preamble_size); + memset(promoted_to_float, 0, sizeof(promoted_to_float)); + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, main); + + nir_foreach_block (block, main) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_preamble) + continue; + + nir_ssa_def *dest = &intrin->dest.ssa; + + unsigned offset = preamble_base + nir_intrinsic_base(intrin); + b->cursor = nir_before_instr(instr); + + nir_ssa_def *new_dest = + nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0), + .base = offset); + + if (dest->bit_size == 1) { + new_dest = nir_i2b1(b, new_dest); + } else if (dest->bit_size != 32) { + assert(dest->bit_size == 16); + if (all_uses_float(dest, true)) { + new_dest = nir_f2f16(b, new_dest); + BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin)); + } else { + new_dest = nir_u2u16(b, new_dest); + } + } + + nir_ssa_def_rewrite_uses(dest, new_dest); + nir_instr_remove(instr); + nir_instr_free(instr); + } + } + + nir_builder_init(b, preamble); + + nir_foreach_block (block, preamble) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_store_preamble) + continue; + + nir_ssa_def *src = intrin->src[0].ssa; + unsigned offset = preamble_base + nir_intrinsic_base(intrin); + + b->cursor = nir_before_instr(instr); + + if (src->bit_size == 1) + src = nir_b2i32(b, src); + if (src->bit_size != 32) { + assert(src->bit_size == 16); + if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) { + src = nir_f2f32(b, 
src); + } else { + src = nir_u2u32(b, src); + } + } + + nir_store_uniform_ir3(b, src, .base = offset); + nir_instr_remove(instr); + nir_instr_free(instr); + } + } + + /* Now, create the preamble sequence and move the preamble into the main + * shader: + * + * if (preamble_start_ir3()) { + * if (subgroupElect()) { + * preamble(); + * preamble_end_ir3(); + * } + * } + * ... + */ + + b->cursor = nir_before_cf_list(&main->body); + + nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1)); + { + nir_if *inner_if = nir_push_if(b, nir_elect(b, 1)); + { + nir_call_instr *call = nir_call_instr_create(nir, main->preamble); + nir_builder_instr_insert(b, &call->instr); + nir_preamble_end_ir3(b); + } + nir_pop_if(b, inner_if); + } + nir_pop_if(b, outer_if); + + nir_inline_functions(nir); + exec_node_remove(&main->preamble->node); + main->preamble = NULL; + + nir_metadata_preserve(main, nir_metadata_none); + return true; +} diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index e764fa02044..f543d8316d5 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -157,6 +157,7 @@ struct ir3_ubo_analysis_state { * that pointer size (ubo, etc) changes depending on generation. * * user consts + * preamble consts * UBO addresses * SSBO sizes * image dimensions @@ -209,6 +210,8 @@ struct ir3_const_state { unsigned immediates_size; uint32_t *immediates; + unsigned preamble_size; + /* State of ubo access lowered to push consts: */ struct ir3_ubo_analysis_state ubo_state; }; diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 54d13f50db0..031f0220da9 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -102,6 +102,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_lower_tex_prefetch.c', 'ir3_nir_lower_wide_load_store.c', 'ir3_nir_move_varying_inputs.c', + 'ir3_nir_opt_preamble.c', 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c',
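
Reviewer note (illustrative sketch, not part of the patch): the combined effect of
ir3_nir_opt_preamble and ir3_nir_lower_preamble can be pictured with a small example.
The shader expression and the const offset below are hypothetical; the intrinsic
spellings follow the ones actually used in the patch (load_preamble/store_preamble
from the generic nir_opt_preamble pass, load_uniform/store_uniform_ir3 after
ir3_nir_lower_preamble), and the control flow mirrors the preamble sequence comment
in ir3_nir_lower_preamble.

    /* Before: a uniform-but-expensive expression recomputed per invocation */
    float x = sin(ubo0.scale) * exp2(ubo0.bias);

    /* After opt_preamble + lower_preamble (pseudocode): the expression is
     * evaluated once by a single elected invocation, pushed into the const
     * file at preamble_base, and the main shader reads it back with a plain
     * const-file load.
     */
    if (preamble_start_ir3()) {        /* cf. the shps/shpe instructions noted in ir3_compiler.h */
       if (elect()) {
          float tmp = sin(ubo0.scale) * exp2(ubo0.bias);
          store_uniform_ir3(tmp, .base = preamble_base + 0);
          preamble_end_ir3();
       }
    }
    ...
    float x = load_uniform(.base = preamble_base + 0);

Here preamble_base = num_reserved_user_consts * 4 + ubo_state.size / 4 (dword units,
as computed in ir3_nir_lower_preamble), and the reserved region is
const_state->preamble_size vec4s long, matching the "+ const_state->preamble_size"
term added to constoff in ir3_setup_const_state.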