From 4d09cd7fa590cbd52d8772d5a251fab8b0874ab7 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Sun, 4 Aug 2024 16:49:43 +0200 Subject: [PATCH] nir/lower_non_uniform_access: Group accesses using the same resource Avoids emitting the waterfall loop for every access if they use the same resource: waterfall_loop { access } waterfall_loop { access } -> waterfall_loop { access access } Totals from 276 (0.33% of 84770) affected shaders: MaxWaves: 3360 -> 3356 (-0.12%) Instrs: 3759927 -> 3730650 (-0.78%) CodeSize: 21125784 -> 20899580 (-1.07%) VGPRs: 23096 -> 23104 (+0.03%) Latency: 35593716 -> 35315455 (-0.78%); split: -0.78%, +0.00% InvThroughput: 7353071 -> 7297309 (-0.76%); split: -0.76%, +0.00% VClause: 120983 -> 118579 (-1.99%) SClause: 113073 -> 110671 (-2.12%) Copies: 358272 -> 348686 (-2.68%) Branches: 166706 -> 159500 (-4.32%) PreSGPRs: 18598 -> 18596 (-0.01%) PreVGPRs: 21417 -> 21424 (+0.03%); split: -0.01%, +0.04% VALU: 2354862 -> 2350053 (-0.20%) SALU: 582291 -> 567638 (-2.52%) SMEM: 139875 -> 137473 (-1.72%) Reviewed-by: Rhys Perry Part-of: --- src/compiler/nir/nir.h | 1 + .../nir/nir_lower_non_uniform_access.c | 263 ++++++++++++++---- 2 files changed, 208 insertions(+), 56 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ca26c9114d1..8acccc74d80 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -6475,6 +6475,7 @@ enum nir_lower_non_uniform_access_type { nir_lower_non_uniform_texture_access = (1 << 2), nir_lower_non_uniform_image_access = (1 << 3), nir_lower_non_uniform_get_ssbo_size = (1 << 4), + nir_lower_non_uniform_access_type_count = 5, }; /* Given the nir_src used for the resource, return the channels which might be non-uniform. */ diff --git a/src/compiler/nir/nir_lower_non_uniform_access.c b/src/compiler/nir/nir_lower_non_uniform_access.c index 0f56c33d299..76180ed99df 100644 --- a/src/compiler/nir/nir_lower_non_uniform_access.c +++ b/src/compiler/nir/nir_lower_non_uniform_access.c @@ -24,18 +24,50 @@ #include "nir.h" #include "nir_builder.h" +#include "util/hash_table.h" +#include "util/u_dynarray.h" + struct nu_handle { - nir_src *src; nir_def *handle; nir_deref_instr *parent_deref; nir_def *first; }; +struct nu_handle_key { + uint32_t block_index; + uint32_t access_group; + uint32_t handle_count; + /* We can have at most one texture and one sampler handle */ + uint32_t handle_indixes[2]; + uint32_t access_type; + /* Optional instruction index for emitting separate loops for non-reorderable instructions. */ + uint32_t instr_index; +}; + +DERIVE_HASH_TABLE(nu_handle_key) + +struct nu_handle_data { + struct nu_handle handles[2]; + struct util_dynarray srcs; +}; + +struct nu_handle_src { + nir_src *srcs[2]; +}; + +struct nu_access_group_state { + uint32_t last_first_use; + uint32_t index; +}; + +struct nu_state { + struct hash_table *accesses; + struct nu_access_group_state access_groups[nir_lower_non_uniform_access_type_count]; +}; + static bool nu_handle_init(struct nu_handle *h, nir_src *src) { - h->src = src; - nir_deref_instr *deref = nir_src_as_deref(*src); if (deref) { if (deref->deref_type == nir_deref_type_var) @@ -65,11 +97,11 @@ nu_handle_init(struct nu_handle *h, nir_src *src) static nir_def * nu_handle_compare(const nir_lower_non_uniform_access_options *options, - nir_builder *b, struct nu_handle *handle) + nir_builder *b, struct nu_handle *handle, nir_src *src) { nir_component_mask_t channel_mask = ~0; if (options->callback) - channel_mask = options->callback(handle->src, options->callback_data); + channel_mask = options->callback(src, options->callback_data); channel_mask &= nir_component_mask(handle->handle->num_components); nir_def *channels[NIR_MAX_VEC_COMPONENTS]; @@ -89,21 +121,90 @@ nu_handle_compare(const nir_lower_non_uniform_access_options *options, } static void -nu_handle_rewrite(nir_builder *b, struct nu_handle *h) +nu_handle_rewrite(nir_builder *b, struct nu_handle *h, nir_src *src) { if (h->parent_deref) { /* Replicate the deref. */ nir_deref_instr *deref = nir_build_deref_array(b, h->parent_deref, h->first); - *(h->src) = nir_src_for_ssa(&deref->def); + nir_src_rewrite(src, &deref->def); } else { - *(h->src) = nir_src_for_ssa(h->first); + nir_src_rewrite(src, h->first); } } static bool -lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options, - nir_builder *b, nir_tex_instr *tex) +get_first_use(nir_def *def, void *state) +{ + uint32_t *last_first_use = state; + nir_foreach_use(use, def) + *last_first_use = MIN2(*last_first_use, nir_src_parent_instr(use)->index); + + return true; +} + +static void +add_non_uniform_instr(struct nu_state *state, struct nu_handle *handles, + nir_src **srcs, uint32_t handle_count, bool group, + enum nir_lower_non_uniform_access_type access_type) +{ + nir_instr *instr = nir_src_parent_instr(srcs[0]); + + struct nu_access_group_state *access_group = &state->access_groups[ffs(access_type) - 1]; + + if (group) { + uint32_t first_use = UINT32_MAX; + nir_foreach_def(instr, get_first_use, &first_use); + + /* Avoid moving accesses below their first use. */ + if (instr->index >= access_group->last_first_use) { + access_group->last_first_use = first_use; + access_group->index++; + } else { + /* Adjust the access group scope so that every access dominates its first use. */ + access_group->last_first_use = MIN2(access_group->last_first_use, first_use); + } + } + + struct nu_handle_key key; + memset(&key, 0, sizeof(key)); + key.block_index = instr->block->index; + key.access_group = access_group->index; + key.access_type = access_type; + key.handle_count = handle_count; + + if (!group) + key.instr_index = instr->index; + + for (uint32_t i = 0; i < handle_count; i++) + key.handle_indixes[i] = handles[i].handle->parent_instr->index; + + struct hash_entry *entry = _mesa_hash_table_search(state->accesses, &key); + if (!entry) { + struct nu_handle_data *data = ralloc(state->accesses, struct nu_handle_data); + + for (uint32_t i = 0; i < handle_count; i++) + data->handles[i] = handles[i]; + + util_dynarray_init(&data->srcs, state->accesses); + + struct nu_handle_key *key_copy = ralloc(state->accesses, struct nu_handle_key); + memcpy(key_copy, &key, sizeof(key)); + + entry = _mesa_hash_table_insert(state->accesses, key_copy, data); + } + + struct nu_handle_data *data = entry->data; + + struct nu_handle_src src = { 0 }; + for (uint32_t i = 0; i < handle_count; i++) + src.srcs[i] = srcs[i]; + + util_dynarray_append(&data->srcs, struct nu_handle_src, src); +} + +static bool +lower_non_uniform_tex_access(struct nu_state *state, nir_tex_instr *tex) { if (!tex->texture_non_uniform && !tex->sampler_non_uniform) return false; @@ -111,6 +212,7 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options /* We can have at most one texture and one sampler handle */ unsigned num_handles = 0; struct nu_handle handles[2]; + nir_src *srcs[2]; for (unsigned i = 0; i < tex->num_srcs; i++) { switch (tex->src[i].src_type) { case nir_tex_src_texture_offset: @@ -131,7 +233,8 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options continue; } - assert(num_handles <= ARRAY_SIZE(handles)); + assert(num_handles < ARRAY_SIZE(handles)); + srcs[num_handles] = &tex->src[i].src; if (nu_handle_init(&handles[num_handles], &tex->src[i].src)) num_handles++; } @@ -143,72 +246,63 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options return false; } - b->cursor = nir_instr_remove(&tex->instr); - - nir_push_loop(b); - - nir_def *all_equal_first = nir_imm_true(b); - for (unsigned i = 0; i < num_handles; i++) { - if (i && handles[i].handle == handles[0].handle) { - handles[i].first = handles[0].first; - continue; - } - - nir_def *equal_first = nu_handle_compare(options, b, &handles[i]); - all_equal_first = nir_iand(b, all_equal_first, equal_first); - } - - nir_push_if(b, all_equal_first); - - for (unsigned i = 0; i < num_handles; i++) - nu_handle_rewrite(b, &handles[i]); - - nir_builder_instr_insert(b, &tex->instr); - nir_jump(b, nir_jump_break); - tex->texture_non_uniform = false; tex->sampler_non_uniform = false; + add_non_uniform_instr(state, handles, srcs, num_handles, true, + nir_lower_non_uniform_texture_access); + return true; } static bool -lower_non_uniform_access_intrin(const nir_lower_non_uniform_access_options *options, - nir_builder *b, nir_intrinsic_instr *intrin, - unsigned handle_src) +lower_non_uniform_access_intrin(struct nu_state *state, nir_intrinsic_instr *intrin, + unsigned handle_src, enum nir_lower_non_uniform_access_type access_type) { if (!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) return false; + nir_src *src = &intrin->src[handle_src]; + struct nu_handle handle; - if (!nu_handle_init(&handle, &intrin->src[handle_src])) { + if (!nu_handle_init(&handle, src)) { nir_intrinsic_set_access(intrin, nir_intrinsic_access(intrin) & ~ACCESS_NON_UNIFORM); return false; } - b->cursor = nir_instr_remove(&intrin->instr); - - nir_push_loop(b); - - nir_push_if(b, nu_handle_compare(options, b, &handle)); - - nu_handle_rewrite(b, &handle); - - nir_builder_instr_insert(b, &intrin->instr); - nir_jump(b, nir_jump_break); - nir_intrinsic_set_access(intrin, nir_intrinsic_access(intrin) & ~ACCESS_NON_UNIFORM); + add_non_uniform_instr(state, &handle, &src, 1, nir_intrinsic_can_reorder(intrin), + access_type); + return true; } +static void +handle_barrier(struct nu_state *state, bool affects_derivatives) +{ + enum nir_lower_non_uniform_access_type access_type = + nir_lower_non_uniform_ssbo_access | nir_lower_non_uniform_image_access; + + if (affects_derivatives) + access_type |= nir_lower_non_uniform_texture_access; + + u_foreach_bit(i, access_type) { + state->access_groups[i].last_first_use = 0; + } +} + static bool nir_lower_non_uniform_access_impl(nir_function_impl *impl, const nir_lower_non_uniform_access_options *options) { bool progress = false; - nir_builder b = nir_builder_create(impl); + struct nu_state state = { + .accesses = nu_handle_key_table_create(NULL), + }; + + nir_metadata_require(impl, nir_metadata_instr_index | nir_metadata_block_index); nir_foreach_block_safe(block, impl) { nir_foreach_instr_safe(instr, block) { @@ -216,7 +310,7 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, case nir_instr_type_tex: { nir_tex_instr *tex = nir_instr_as_tex(instr); if ((options->types & nir_lower_non_uniform_texture_access) && - lower_non_uniform_tex_access(options, &b, tex)) + lower_non_uniform_tex_access(&state, tex)) progress = true; break; } @@ -224,9 +318,18 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, case nir_instr_type_intrinsic: { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { + case nir_intrinsic_terminate_if: + case nir_intrinsic_terminate: + case nir_intrinsic_demote_if: + case nir_intrinsic_demote: + case nir_intrinsic_barrier: + handle_barrier(&state, intrin->intrinsic == nir_intrinsic_terminate_if || + intrin->intrinsic == nir_intrinsic_terminate); + break; + case nir_intrinsic_load_ubo: if ((options->types & nir_lower_non_uniform_ubo_access) && - lower_non_uniform_access_intrin(options, &b, intrin, 0)) + lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_ubo_access)) progress = true; break; @@ -234,20 +337,20 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, case nir_intrinsic_ssbo_atomic: case nir_intrinsic_ssbo_atomic_swap: if ((options->types & nir_lower_non_uniform_ssbo_access) && - lower_non_uniform_access_intrin(options, &b, intrin, 0)) + lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_ssbo_access)) progress = true; break; case nir_intrinsic_store_ssbo: /* SSBO Stores put the index in the second source */ if ((options->types & nir_lower_non_uniform_ssbo_access) && - lower_non_uniform_access_intrin(options, &b, intrin, 1)) + lower_non_uniform_access_intrin(&state, intrin, 1, nir_lower_non_uniform_ssbo_access)) progress = true; break; case nir_intrinsic_get_ssbo_size: if ((options->types & nir_lower_non_uniform_get_ssbo_size) && - lower_non_uniform_access_intrin(options, &b, intrin, 0)) + lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_get_ssbo_size)) progress = true; break; @@ -282,7 +385,7 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, case nir_intrinsic_image_deref_samples_identical: case nir_intrinsic_image_deref_fragment_mask_load_amd: if ((options->types & nir_lower_non_uniform_image_access) && - lower_non_uniform_access_intrin(options, &b, intrin, 0)) + lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_image_access)) progress = true; break; @@ -293,6 +396,10 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, break; } + case nir_instr_type_call: + handle_barrier(&state, true); + break; + default: /* Nothing to do */ break; @@ -300,6 +407,50 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, } } + nir_builder b = nir_builder_create(impl); + + hash_table_foreach(state.accesses, entry) { + const struct nu_handle_key *key = entry->key; + struct nu_handle_data data = *(struct nu_handle_data *)entry->data; + + nir_src *first_src = util_dynarray_top_ptr(&data.srcs, struct nu_handle_src)->srcs[0]; + b.cursor = nir_after_instr(nir_src_parent_instr(first_src)); + + nir_push_loop(&b); + + nir_def *all_equal_first = NULL; + for (uint32_t i = 0; i < key->handle_count; i++) { + if (i && data.handles[i].handle == data.handles[0].handle) { + data.handles[i].first = data.handles[0].first; + continue; + } + + nir_def *equal_first = nu_handle_compare(options, &b, &data.handles[i], first_src); + if (i == 0) + all_equal_first = equal_first; + else + all_equal_first = nir_iand(&b, all_equal_first, equal_first); + } + + nir_push_if(&b, all_equal_first); + + util_dynarray_foreach(&data.srcs, struct nu_handle_src, src) { + for (uint32_t i = 0; i < key->handle_count; i++) + nu_handle_rewrite(&b, &data.handles[i], src->srcs[i]); + + nir_instr *instr = nir_src_parent_instr(src->srcs[0]); + nir_instr_remove(instr); + nir_builder_instr_insert(&b, instr); + } + + nir_jump(&b, nir_jump_break); + + nir_pop_if(&b, NULL); + nir_pop_loop(&b, NULL); + } + + _mesa_hash_table_destroy(state.accesses, NULL); + if (progress) nir_metadata_preserve(impl, nir_metadata_none);