From f83bc5beb803348f8226ea904c55dc6f9262d549 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 1 Sep 2020 16:31:37 +0100 Subject: [PATCH] nir: add pass to optimize uniform atomics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This optimizes atomics with a uniform offset so that only one atomic operation is done in the subgroup. For shaders which do a very large amount of atomics, this can significantly improve performance. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/compiler/Makefile.sources | 1 + src/compiler/nir/meson.build | 1 + src/compiler/nir/nir.h | 2 + src/compiler/nir/nir_opt_uniform_atomics.c | 257 +++++++++++++++++++++ 4 files changed, 261 insertions(+) create mode 100644 src/compiler/nir/nir_opt_uniform_atomics.c diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 4925b53c2c2..931af638bf7 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -340,6 +340,7 @@ NIR_FILES = \ nir/nir_opt_sink.c \ nir/nir_opt_trivial_continues.c \ nir/nir_opt_undef.c \ + nir/nir_opt_uniform_atomics.c \ nir/nir_opt_vectorize.c \ nir/nir_phi_builder.c \ nir/nir_phi_builder.h \ diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 3e578732e8b..d9899d29fc2 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -221,6 +221,7 @@ files_libnir = files( 'nir_opt_sink.c', 'nir_opt_trivial_continues.c', 'nir_opt_undef.c', + 'nir_opt_uniform_atomics.c', 'nir_opt_vectorize.c', 'nir_phi_builder.c', 'nir_phi_builder.h', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 0bf59e44d36..15a85c4cd64 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4900,6 +4900,8 @@ bool nir_opt_trivial_continues(nir_shader *shader); bool nir_opt_undef(nir_shader *shader); +bool nir_opt_uniform_atomics(nir_shader *shader); + typedef bool (*nir_opt_vectorize_cb)(const nir_instr *a, const nir_instr 
*b, void *data); bool nir_opt_vectorize(nir_shader *shader, nir_opt_vectorize_cb filter, diff --git a/src/compiler/nir/nir_opt_uniform_atomics.c b/src/compiler/nir/nir_opt_uniform_atomics.c new file mode 100644 index 00000000000..cb21ff92422 --- /dev/null +++ b/src/compiler/nir/nir_opt_uniform_atomics.c @@ -0,0 +1,257 @@ +/* + * Copyright © 2020 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +/* + * Optimizes atomics (with uniform offsets) using subgroup operations to ensure + * only one atomic operation is done per subgroup. So res = atomicAdd(addr, 1) + * would become something like: + * + * uint tmp = subgroupAdd(1); + * uint res; + * if (subgroupElect()) + * res = atomicAdd(addr, tmp); + * res = subgroupBroadcastFirst(res) + subgroupExclusiveAdd(1); + * + * This pass requires and preserves LCSSA and divergence information. 
+ */
+
+#include "nir/nir.h"
+#include "nir/nir_builder.h"
+
+/*
+ * Maps an atomic intrinsic to the ALU opcode that combines two data values
+ * the same way the atomic does, and reports which of the intrinsic's sources
+ * hold the offset/address (*offset_src) and the data operand (*data_src).
+ *
+ * Returns nir_num_opcodes for any intrinsic that is not listed below; in that
+ * case *offset_src and *data_src are not written.
+ */
+static nir_op
+parse_atomic_op(nir_intrinsic_op op, unsigned *offset_src, unsigned *data_src)
+{
+   switch (op) {
+   #define OP(intrin, alu) \
+   case nir_intrinsic_ssbo_atomic_##intrin: \
+      *offset_src = 1; \
+      *data_src = 2; \
+      return nir_op_##alu; \
+   case nir_intrinsic_shared_atomic_##intrin: \
+   case nir_intrinsic_global_atomic_##intrin: \
+   case nir_intrinsic_deref_atomic_##intrin: \
+      *offset_src = 0; \
+      *data_src = 1; \
+      return nir_op_##alu;
+   OP(add, iadd)
+   OP(imin, imin)
+   OP(umin, umin)
+   OP(imax, imax)
+   OP(umax, umax)
+   OP(and, iand)
+   OP(or, ior)
+   OP(xor, ixor)
+   OP(fadd, fadd)
+   OP(fmin, fmin)
+   OP(fmax, fmax)
+   #undef OP
+   default:
+      return nir_num_opcodes;
+   }
+}
+
+/*
+ * Inserts a source-less intrinsic `op` with a single-component destination of
+ * `bit_size` bits at the builder's cursor and returns its result.
+ */
+static nir_ssa_def *
+emit_scalar_intrinsic(nir_builder *b, nir_intrinsic_op op, unsigned bit_size)
+{
+   nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
+   nir_ssa_dest_init(&intrin->instr, &intrin->dest, 1, bit_size, NULL);
+   nir_builder_instr_insert(b, &intrin->instr);
+   return &intrin->dest.ssa;
+}
+
+/*
+ * Inserts a read_invocation of `data` from `lane`, or a
+ * read_first_invocation of `data` when `lane` is NULL, and returns the
+ * single-component result (same bit size as `data`).
+ */
+static nir_ssa_def *
+emit_read_invocation(nir_builder *b, nir_ssa_def *data, nir_ssa_def *lane)
+{
+   nir_intrinsic_instr *ri = nir_intrinsic_instr_create(
+      b->shader, lane ? nir_intrinsic_read_invocation : nir_intrinsic_read_first_invocation);
+   nir_ssa_dest_init(&ri->instr, &ri->dest, 1, data->bit_size, NULL);
+   ri->num_components = 1;
+   ri->src[0] = nir_src_for_ssa(data);
+   if (lane)
+      ri->src[1] = nir_src_for_ssa(lane);
+   nir_builder_instr_insert(b, &ri->instr);
+   return &ri->dest.ssa;
+}
+
+/*
+ * Perform a reduction and/or exclusive scan of `data` using `op`.
+ *
+ * - scan != NULL: an exclusive_scan intrinsic is emitted and stored in *scan.
+ *   If reduce is also non-NULL, the reduction is derived from the scan by
+ *   combining it with `data` and reading that value from the last invocation.
+ * - scan == NULL: a reduce intrinsic is emitted and, if reduce is non-NULL,
+ *   stored in *reduce.
+ */
+static void
+reduce_data(nir_builder *b, nir_op op, nir_ssa_def *data,
+            nir_ssa_def **reduce, nir_ssa_def **scan)
+{
+   nir_intrinsic_op intrin_op = scan ? nir_intrinsic_exclusive_scan : nir_intrinsic_reduce;
+   nir_intrinsic_instr *intrin =
+      nir_intrinsic_instr_create(b->shader, intrin_op);
+   intrin->num_components = 1;
+   intrin->src[0] = nir_src_for_ssa(data);
+   nir_intrinsic_set_reduction_op(intrin, op);
+   nir_ssa_dest_init(&intrin->instr, &intrin->dest, 1, data->bit_size, NULL);
+   nir_builder_instr_insert(b, &intrin->instr);
+
+   if (scan)
+      *scan = &intrin->dest.ssa;
+
+   if (scan && reduce) {
+      /* exclusive scan combined with the invocation's own data gives the
+       * inclusive value; taken from the last invocation it is the reduction.
+       */
+      nir_ssa_def *last_lane = emit_scalar_intrinsic(b, nir_intrinsic_last_invocation, 32);
+      nir_ssa_def *res = nir_build_alu(b, op, *scan, data, NULL, NULL);
+      *reduce = emit_read_invocation(b, res, last_lane);
+   } else if (reduce) {
+      *reduce = &intrin->dest.ssa;
+   }
+}
+
+/*
+ * Rewrites `intrin` so that its data source is the reduction of the original
+ * data, and moves it inside an `if (elect)` so it is only executed by the
+ * elected invocation.
+ *
+ * If return_prev is true, returns a value built from the atomic's result
+ * (read from the first invocation) combined via `op` with an exclusive scan
+ * of the original data. Otherwise returns NULL.
+ */
+static nir_ssa_def *
+optimize_atomic(nir_builder *b, nir_intrinsic_instr *intrin, bool return_prev)
+{
+   unsigned offset_src, data_src;
+   nir_op op = parse_atomic_op(intrin->intrinsic, &offset_src, &data_src);
+   nir_ssa_def *data = intrin->src[data_src].ssa;
+
+   /* Separate uniform reduction and scan is faster than doing a combined scan+reduce */
+   bool combined_scan_reduce = return_prev && data->divergent;
+   nir_ssa_def *reduce = NULL, *scan = NULL;
+   reduce_data(b, op, data, &reduce, combined_scan_reduce ? &scan : NULL);
+
+   nir_instr_rewrite_src(&intrin->instr, &intrin->src[data_src], nir_src_for_ssa(reduce));
+   nir_update_instr_divergence(b->shader, &intrin->instr);
+
+   nir_ssa_def *cond = emit_scalar_intrinsic(b, nir_intrinsic_elect, 1);
+
+   nir_if *nif = nir_push_if(b, cond);
+
+   /* Re-insert the atomic at the cursor, which is now inside the then-branch. */
+   nir_instr_remove(&intrin->instr);
+   nir_builder_instr_insert(b, &intrin->instr);
+
+   if (return_prev) {
+      nir_push_else(b, nif);
+
+      nir_ssa_def *undef = nir_ssa_undef(b, 1, intrin->dest.ssa.bit_size);
+
+      nir_pop_if(b, nif);
+      nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, undef);
+      result = emit_read_invocation(b, result, NULL);
+
+      /* In the non-combined case only the reduction was computed above. */
+      if (!combined_scan_reduce)
+         reduce_data(b, op, data, NULL, &scan);
+
+      return nir_build_alu(b, op, result, scan, NULL, NULL);
+   } else {
+      nir_pop_if(b, nif);
+      return NULL;
+   }
+}
+
+/*
+ * Optimizes `intrin` with optimize_atomic() and rewrites all uses of its
+ * original result to the new value. In fragment shaders the whole sequence is
+ * wrapped in `if (!is_helper_invocation)`, with an undef value on the else
+ * path when a result is needed.
+ */
+static void
+optimize_and_rewrite_atomic(nir_builder *b, nir_intrinsic_instr *intrin)
+{
+   nir_if *helper_nif = NULL;
+   if (b->shader->info.stage == MESA_SHADER_FRAGMENT) {
+      nir_ssa_def *helper = emit_scalar_intrinsic(b, nir_intrinsic_is_helper_invocation, 1);
+      helper_nif = nir_push_if(b, nir_inot(b, helper));
+   }
+
+   ASSERTED bool original_result_divergent = intrin->dest.ssa.divergent;
+   bool return_prev = !list_is_empty(&intrin->dest.ssa.uses) ||
+                      !list_is_empty(&intrin->dest.ssa.if_uses);
+
+   /* Keep the old definition (and its use lists) in a stack copy and give the
+    * intrinsic a fresh destination, so the old uses can be rewritten below.
+    */
+   nir_ssa_def old_result = intrin->dest.ssa;
+   list_replace(&intrin->dest.ssa.uses, &old_result.uses);
+   list_replace(&intrin->dest.ssa.if_uses, &old_result.if_uses);
+   nir_ssa_dest_init(&intrin->instr, &intrin->dest, 1, intrin->dest.ssa.bit_size, NULL);
+
+   nir_ssa_def *result = optimize_atomic(b, intrin, return_prev);
+
+   if (helper_nif) {
+      nir_push_else(b, helper_nif);
+      nir_ssa_def *undef = result ? nir_ssa_undef(b, 1, result->bit_size) : NULL;
+      nir_pop_if(b, helper_nif);
+      if (result)
+         result = nir_if_phi(b, result, undef);
+   }
+
+   if (result) {
+      assert(result->divergent == original_result_divergent);
+      nir_ssa_def_rewrite_uses(&old_result, nir_src_for_ssa(result));
+   }
+}
+
+/*
+ * Runs the optimization on every atomic intrinsic in `impl` that
+ * parse_atomic_op() recognizes and whose offset/address source is not
+ * divergent. Returns true if any atomic was rewritten.
+ */
+static bool
+opt_uniform_atomics(nir_function_impl *impl)
+{
+   bool progress = false;
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   b.update_divergence = true;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         unsigned offset_src, data_src;
+         if (parse_atomic_op(intrin->intrinsic, &offset_src, &data_src) == nir_num_opcodes)
+            continue;
+
+         if (nir_src_is_divergent(intrin->src[offset_src]))
+            continue;
+
+         b.cursor = nir_before_instr(instr);
+         optimize_and_rewrite_atomic(&b, intrin);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+/*
+ * Pass entry point: applies opt_uniform_atomics() to every function with an
+ * implementation. Returns true if any function was changed. Functions that
+ * were changed preserve no metadata; unchanged ones preserve all of it.
+ */
+bool
+nir_opt_uniform_atomics(nir_shader *shader)
+{
+   bool progress = false;
+
+   /* A 1x1x1 workgroup only ever has one active lane, so there's no point in
+    * optimizing any atomics.
+    */
+   if (shader->info.stage == MESA_SHADER_COMPUTE && !shader->info.cs.local_size_variable &&
+       shader->info.cs.local_size[0] == 1 && shader->info.cs.local_size[1] == 1 &&
+       shader->info.cs.local_size[2] == 1)
+      return false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      if (opt_uniform_atomics(function->impl)) {
+         progress = true;
+         nir_metadata_preserve(function->impl, nir_metadata_none);
+      } else {
+         nir_metadata_preserve(function->impl, nir_metadata_all);
+      }
+   }
+
+   return progress;
+}