nir/lower_non_uniform_access: Group accesses using the same resource

Avoids emitting the waterfall loop for every access if they use the same
resource:

waterfall_loop {
   access
}
waterfall_loop {
   access
}

->

waterfall_loop {
   access
   access
}

Totals from 276 (0.33% of 84770) affected shaders:
MaxWaves: 3360 -> 3356 (-0.12%)
Instrs: 3759927 -> 3730650 (-0.78%)
CodeSize: 21125784 -> 20899580 (-1.07%)
VGPRs: 23096 -> 23104 (+0.03%)
Latency: 35593716 -> 35315455 (-0.78%); split: -0.78%, +0.00%
InvThroughput: 7353071 -> 7297309 (-0.76%); split: -0.76%, +0.00%
VClause: 120983 -> 118579 (-1.99%)
SClause: 113073 -> 110671 (-2.12%)
Copies: 358272 -> 348686 (-2.68%)
Branches: 166706 -> 159500 (-4.32%)
PreSGPRs: 18598 -> 18596 (-0.01%)
PreVGPRs: 21417 -> 21424 (+0.03%); split: -0.01%, +0.04%
VALU: 2354862 -> 2350053 (-0.20%)
SALU: 582291 -> 567638 (-2.52%)
SMEM: 139875 -> 137473 (-1.72%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30509>
This commit is contained in:
Konstantin 2024-08-04 16:49:43 +02:00 committed by Marge Bot
parent c5e40a60f8
commit 4d09cd7fa5
2 changed files with 208 additions and 56 deletions

View file

@ -6475,6 +6475,7 @@ enum nir_lower_non_uniform_access_type {
nir_lower_non_uniform_texture_access = (1 << 2),
nir_lower_non_uniform_image_access = (1 << 3),
nir_lower_non_uniform_get_ssbo_size = (1 << 4),
nir_lower_non_uniform_access_type_count = 5,
};
/* Given the nir_src used for the resource, return the channels which might be non-uniform. */

View file

@ -24,18 +24,50 @@
#include "nir.h"
#include "nir_builder.h"
#include "util/hash_table.h"
#include "util/u_dynarray.h"
struct nu_handle {
nir_src *src;
nir_def *handle;
nir_deref_instr *parent_deref;
nir_def *first;
};
struct nu_handle_key {
uint32_t block_index;
uint32_t access_group;
uint32_t handle_count;
/* We can have at most one texture and one sampler handle */
uint32_t handle_indixes[2];
uint32_t access_type;
/* Optional instruction index for emitting separate loops for non-reorderable instructions. */
uint32_t instr_index;
};
DERIVE_HASH_TABLE(nu_handle_key)
struct nu_handle_data {
struct nu_handle handles[2];
struct util_dynarray srcs;
};
struct nu_handle_src {
nir_src *srcs[2];
};
struct nu_access_group_state {
uint32_t last_first_use;
uint32_t index;
};
struct nu_state {
struct hash_table *accesses;
struct nu_access_group_state access_groups[nir_lower_non_uniform_access_type_count];
};
static bool
nu_handle_init(struct nu_handle *h, nir_src *src)
{
h->src = src;
nir_deref_instr *deref = nir_src_as_deref(*src);
if (deref) {
if (deref->deref_type == nir_deref_type_var)
@ -65,11 +97,11 @@ nu_handle_init(struct nu_handle *h, nir_src *src)
static nir_def *
nu_handle_compare(const nir_lower_non_uniform_access_options *options,
nir_builder *b, struct nu_handle *handle)
nir_builder *b, struct nu_handle *handle, nir_src *src)
{
nir_component_mask_t channel_mask = ~0;
if (options->callback)
channel_mask = options->callback(handle->src, options->callback_data);
channel_mask = options->callback(src, options->callback_data);
channel_mask &= nir_component_mask(handle->handle->num_components);
nir_def *channels[NIR_MAX_VEC_COMPONENTS];
@ -89,21 +121,90 @@ nu_handle_compare(const nir_lower_non_uniform_access_options *options,
}
static void
nu_handle_rewrite(nir_builder *b, struct nu_handle *h)
nu_handle_rewrite(nir_builder *b, struct nu_handle *h, nir_src *src)
{
if (h->parent_deref) {
/* Replicate the deref. */
nir_deref_instr *deref =
nir_build_deref_array(b, h->parent_deref, h->first);
*(h->src) = nir_src_for_ssa(&deref->def);
nir_src_rewrite(src, &deref->def);
} else {
*(h->src) = nir_src_for_ssa(h->first);
nir_src_rewrite(src, h->first);
}
}
static bool
lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options,
nir_builder *b, nir_tex_instr *tex)
get_first_use(nir_def *def, void *state)
{
uint32_t *last_first_use = state;
nir_foreach_use(use, def)
*last_first_use = MIN2(*last_first_use, nir_src_parent_instr(use)->index);
return true;
}
static void
add_non_uniform_instr(struct nu_state *state, struct nu_handle *handles,
nir_src **srcs, uint32_t handle_count, bool group,
enum nir_lower_non_uniform_access_type access_type)
{
nir_instr *instr = nir_src_parent_instr(srcs[0]);
struct nu_access_group_state *access_group = &state->access_groups[ffs(access_type) - 1];
if (group) {
uint32_t first_use = UINT32_MAX;
nir_foreach_def(instr, get_first_use, &first_use);
/* Avoid moving accesses below their first use. */
if (instr->index >= access_group->last_first_use) {
access_group->last_first_use = first_use;
access_group->index++;
} else {
/* Adjust the access group scope so that every access dominates its first use. */
access_group->last_first_use = MIN2(access_group->last_first_use, first_use);
}
}
struct nu_handle_key key;
memset(&key, 0, sizeof(key));
key.block_index = instr->block->index;
key.access_group = access_group->index;
key.access_type = access_type;
key.handle_count = handle_count;
if (!group)
key.instr_index = instr->index;
for (uint32_t i = 0; i < handle_count; i++)
key.handle_indixes[i] = handles[i].handle->parent_instr->index;
struct hash_entry *entry = _mesa_hash_table_search(state->accesses, &key);
if (!entry) {
struct nu_handle_data *data = ralloc(state->accesses, struct nu_handle_data);
for (uint32_t i = 0; i < handle_count; i++)
data->handles[i] = handles[i];
util_dynarray_init(&data->srcs, state->accesses);
struct nu_handle_key *key_copy = ralloc(state->accesses, struct nu_handle_key);
memcpy(key_copy, &key, sizeof(key));
entry = _mesa_hash_table_insert(state->accesses, key_copy, data);
}
struct nu_handle_data *data = entry->data;
struct nu_handle_src src = { 0 };
for (uint32_t i = 0; i < handle_count; i++)
src.srcs[i] = srcs[i];
util_dynarray_append(&data->srcs, struct nu_handle_src, src);
}
static bool
lower_non_uniform_tex_access(struct nu_state *state, nir_tex_instr *tex)
{
if (!tex->texture_non_uniform && !tex->sampler_non_uniform)
return false;
@ -111,6 +212,7 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options
/* We can have at most one texture and one sampler handle */
unsigned num_handles = 0;
struct nu_handle handles[2];
nir_src *srcs[2];
for (unsigned i = 0; i < tex->num_srcs; i++) {
switch (tex->src[i].src_type) {
case nir_tex_src_texture_offset:
@ -131,7 +233,8 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options
continue;
}
assert(num_handles <= ARRAY_SIZE(handles));
assert(num_handles < ARRAY_SIZE(handles));
srcs[num_handles] = &tex->src[i].src;
if (nu_handle_init(&handles[num_handles], &tex->src[i].src))
num_handles++;
}
@ -143,72 +246,63 @@ lower_non_uniform_tex_access(const nir_lower_non_uniform_access_options *options
return false;
}
b->cursor = nir_instr_remove(&tex->instr);
nir_push_loop(b);
nir_def *all_equal_first = nir_imm_true(b);
for (unsigned i = 0; i < num_handles; i++) {
if (i && handles[i].handle == handles[0].handle) {
handles[i].first = handles[0].first;
continue;
}
nir_def *equal_first = nu_handle_compare(options, b, &handles[i]);
all_equal_first = nir_iand(b, all_equal_first, equal_first);
}
nir_push_if(b, all_equal_first);
for (unsigned i = 0; i < num_handles; i++)
nu_handle_rewrite(b, &handles[i]);
nir_builder_instr_insert(b, &tex->instr);
nir_jump(b, nir_jump_break);
tex->texture_non_uniform = false;
tex->sampler_non_uniform = false;
add_non_uniform_instr(state, handles, srcs, num_handles, true,
nir_lower_non_uniform_texture_access);
return true;
}
static bool
lower_non_uniform_access_intrin(const nir_lower_non_uniform_access_options *options,
nir_builder *b, nir_intrinsic_instr *intrin,
unsigned handle_src)
lower_non_uniform_access_intrin(struct nu_state *state, nir_intrinsic_instr *intrin,
unsigned handle_src, enum nir_lower_non_uniform_access_type access_type)
{
if (!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM))
return false;
nir_src *src = &intrin->src[handle_src];
struct nu_handle handle;
if (!nu_handle_init(&handle, &intrin->src[handle_src])) {
if (!nu_handle_init(&handle, src)) {
nir_intrinsic_set_access(intrin, nir_intrinsic_access(intrin) & ~ACCESS_NON_UNIFORM);
return false;
}
b->cursor = nir_instr_remove(&intrin->instr);
nir_push_loop(b);
nir_push_if(b, nu_handle_compare(options, b, &handle));
nu_handle_rewrite(b, &handle);
nir_builder_instr_insert(b, &intrin->instr);
nir_jump(b, nir_jump_break);
nir_intrinsic_set_access(intrin, nir_intrinsic_access(intrin) & ~ACCESS_NON_UNIFORM);
add_non_uniform_instr(state, &handle, &src, 1, nir_intrinsic_can_reorder(intrin),
access_type);
return true;
}
static void
handle_barrier(struct nu_state *state, bool affects_derivatives)
{
enum nir_lower_non_uniform_access_type access_type =
nir_lower_non_uniform_ssbo_access | nir_lower_non_uniform_image_access;
if (affects_derivatives)
access_type |= nir_lower_non_uniform_texture_access;
u_foreach_bit(i, access_type) {
state->access_groups[i].last_first_use = 0;
}
}
static bool
nir_lower_non_uniform_access_impl(nir_function_impl *impl,
const nir_lower_non_uniform_access_options *options)
{
bool progress = false;
nir_builder b = nir_builder_create(impl);
struct nu_state state = {
.accesses = nu_handle_key_table_create(NULL),
};
nir_metadata_require(impl, nir_metadata_instr_index | nir_metadata_block_index);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
@ -216,7 +310,7 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
case nir_instr_type_tex: {
nir_tex_instr *tex = nir_instr_as_tex(instr);
if ((options->types & nir_lower_non_uniform_texture_access) &&
lower_non_uniform_tex_access(options, &b, tex))
lower_non_uniform_tex_access(&state, tex))
progress = true;
break;
}
@ -224,9 +318,18 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_terminate_if:
case nir_intrinsic_terminate:
case nir_intrinsic_demote_if:
case nir_intrinsic_demote:
case nir_intrinsic_barrier:
handle_barrier(&state, intrin->intrinsic == nir_intrinsic_terminate_if ||
intrin->intrinsic == nir_intrinsic_terminate);
break;
case nir_intrinsic_load_ubo:
if ((options->types & nir_lower_non_uniform_ubo_access) &&
lower_non_uniform_access_intrin(options, &b, intrin, 0))
lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_ubo_access))
progress = true;
break;
@ -234,20 +337,20 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
if ((options->types & nir_lower_non_uniform_ssbo_access) &&
lower_non_uniform_access_intrin(options, &b, intrin, 0))
lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_ssbo_access))
progress = true;
break;
case nir_intrinsic_store_ssbo:
/* SSBO Stores put the index in the second source */
if ((options->types & nir_lower_non_uniform_ssbo_access) &&
lower_non_uniform_access_intrin(options, &b, intrin, 1))
lower_non_uniform_access_intrin(&state, intrin, 1, nir_lower_non_uniform_ssbo_access))
progress = true;
break;
case nir_intrinsic_get_ssbo_size:
if ((options->types & nir_lower_non_uniform_get_ssbo_size) &&
lower_non_uniform_access_intrin(options, &b, intrin, 0))
lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_get_ssbo_size))
progress = true;
break;
@ -282,7 +385,7 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
case nir_intrinsic_image_deref_samples_identical:
case nir_intrinsic_image_deref_fragment_mask_load_amd:
if ((options->types & nir_lower_non_uniform_image_access) &&
lower_non_uniform_access_intrin(options, &b, intrin, 0))
lower_non_uniform_access_intrin(&state, intrin, 0, nir_lower_non_uniform_image_access))
progress = true;
break;
@ -293,6 +396,10 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
break;
}
case nir_instr_type_call:
handle_barrier(&state, true);
break;
default:
/* Nothing to do */
break;
@ -300,6 +407,50 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
}
}
nir_builder b = nir_builder_create(impl);
hash_table_foreach(state.accesses, entry) {
const struct nu_handle_key *key = entry->key;
struct nu_handle_data data = *(struct nu_handle_data *)entry->data;
nir_src *first_src = util_dynarray_top_ptr(&data.srcs, struct nu_handle_src)->srcs[0];
b.cursor = nir_after_instr(nir_src_parent_instr(first_src));
nir_push_loop(&b);
nir_def *all_equal_first = NULL;
for (uint32_t i = 0; i < key->handle_count; i++) {
if (i && data.handles[i].handle == data.handles[0].handle) {
data.handles[i].first = data.handles[0].first;
continue;
}
nir_def *equal_first = nu_handle_compare(options, &b, &data.handles[i], first_src);
if (i == 0)
all_equal_first = equal_first;
else
all_equal_first = nir_iand(&b, all_equal_first, equal_first);
}
nir_push_if(&b, all_equal_first);
util_dynarray_foreach(&data.srcs, struct nu_handle_src, src) {
for (uint32_t i = 0; i < key->handle_count; i++)
nu_handle_rewrite(&b, &data.handles[i], src->srcs[i]);
nir_instr *instr = nir_src_parent_instr(src->srcs[0]);
nir_instr_remove(instr);
nir_builder_instr_insert(&b, instr);
}
nir_jump(&b, nir_jump_break);
nir_pop_if(&b, NULL);
nir_pop_loop(&b, NULL);
}
_mesa_hash_table_destroy(state.accesses, NULL);
if (progress)
nir_metadata_preserve(impl, nir_metadata_none);