mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-14 06:30:26 +01:00
nir/opt_load_store_vectorize: Allow to vectorize at most one entry of each type across blocks
The idea is to initialize the vectorization table with one entry from the previous blocks if it's the same for all predecessors. In order to not speculatively load out-of-bounds, backends need to set a new bounds_checked_modes option indicating variable modes for which per-component bounds checks are supported. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39373>
This commit is contained in:
parent
0a07ea20e6
commit
4ca7ee7bd7
2 changed files with 89 additions and 9 deletions
|
|
@ -5530,6 +5530,7 @@ typedef struct nir_load_store_vectorize_options {
|
|||
unsigned (*round_up_components)(unsigned);
|
||||
nir_variable_mode modes;
|
||||
nir_variable_mode robust_modes;
|
||||
nir_variable_mode bounds_checked_modes; /* modes with per-component bounds-checking */
|
||||
void *cb_data;
|
||||
bool has_shared2_amd;
|
||||
bool round_up_store_components;
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ struct entry_key {
|
|||
/* Information on a single memory operation. */
|
||||
struct entry {
|
||||
struct list_head head;
|
||||
unsigned index;
|
||||
int index;
|
||||
|
||||
struct entry_key *key;
|
||||
/* The constant offset is sign-extended to 64 bits. */
|
||||
|
|
@ -198,6 +198,10 @@ struct entry {
|
|||
nir_deref_instr *deref;
|
||||
};
|
||||
|
||||
struct block_ctx {
|
||||
struct entry *last_entry[nir_num_variable_modes];
|
||||
};
|
||||
|
||||
struct vectorize_ctx {
|
||||
linear_ctx *linear_mem_ctx;
|
||||
nir_shader *shader;
|
||||
|
|
@ -207,6 +211,7 @@ struct vectorize_ctx {
|
|||
struct list_head entries[nir_num_variable_modes];
|
||||
struct hash_table *loads[nir_num_variable_modes];
|
||||
struct hash_table *stores[nir_num_variable_modes];
|
||||
struct block_ctx *per_block_ctx;
|
||||
};
|
||||
|
||||
static unsigned
|
||||
|
|
@ -935,7 +940,9 @@ static void
|
|||
hoist_base_addr(nir_instr *instr, nir_instr *to_hoist)
|
||||
{
|
||||
/* Return if this instruction already dominates the first load. */
|
||||
if (to_hoist->block != instr->block || to_hoist->index <= instr->index)
|
||||
if (to_hoist->block->index < instr->block->index)
|
||||
return;
|
||||
if (to_hoist->block->index == instr->block->index && to_hoist->index <= instr->index)
|
||||
return;
|
||||
|
||||
/* Only the offset calculation (consisting of ALU and load_const)
|
||||
|
|
@ -1076,6 +1083,7 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
|
|||
first->align_mul = low->align_mul;
|
||||
first->align_offset = low->align_offset;
|
||||
|
||||
list_del(&second->head);
|
||||
nir_instr_remove(second->instr);
|
||||
}
|
||||
|
||||
|
|
@ -1382,10 +1390,6 @@ addition_wraps(uint64_t a, uint64_t b, unsigned bits)
|
|||
static bool
|
||||
check_for_robustness(struct vectorize_ctx *ctx, struct entry *low, uint64_t high_offset)
|
||||
{
|
||||
nir_variable_mode mode = get_variable_mode(low);
|
||||
if (!(mode & ctx->options->robust_modes))
|
||||
return false;
|
||||
|
||||
/* First, try to use alignment information in case the application provided some. If the addition
|
||||
* of the maximum offset of the low load and "high_offset" wraps around, we can't combine the low
|
||||
* and high loads.
|
||||
|
|
@ -1477,8 +1481,12 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
|||
return false;
|
||||
|
||||
uint64_t diff = get_offset_diff(low, high);
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
nir_variable_mode mode = get_variable_mode(low);
|
||||
if ((mode & ctx->options->robust_modes) || high->index == -1) {
|
||||
/* Only vectorize robust modes or hoist low entries, if robustness is ensured. */
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
}
|
||||
|
||||
/* don't attempt to vectorize accesses of row-major matrix columns */
|
||||
if (first->deref) {
|
||||
|
|
@ -1575,12 +1583,15 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
|
|||
|
||||
/* vectorize the accesses */
|
||||
uint32_t access = nir_intrinsic_access(first->intrin);
|
||||
struct entry *new_entry;
|
||||
if (first->is_store) {
|
||||
nir_def *low_val = low->intrin->src[low->info->value_src].ssa;
|
||||
nir_def *high_val = high->intrin->src[high->info->value_src].ssa;
|
||||
nir_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
|
||||
nir_bitcast_vector(&b, high_val, low_size * 8u));
|
||||
nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64, .access = access);
|
||||
nir_intrinsic_instr *intrin =
|
||||
nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64, .access = access);
|
||||
new_entry = create_entry(ctx, get_info(nir_intrinsic_store_shared2_amd), intrin);
|
||||
} else {
|
||||
nir_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1 = diff / stride,
|
||||
.st64 = st64, .access = access);
|
||||
|
|
@ -1588,8 +1599,15 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
|
|||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
|
||||
nir_def_rewrite_uses(&high->intrin->def,
|
||||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 1), high_bit_size));
|
||||
new_entry = create_entry(ctx, get_info(nir_intrinsic_load_shared2_amd), nir_def_as_intrinsic(new_def));
|
||||
}
|
||||
|
||||
/* Add a new entry, so that alias checks stay intact. Remove the old entries,
|
||||
* so that they can't accidentally be vectorized any further.
|
||||
*/
|
||||
list_add(&new_entry->head, &(first->is_store ? second : first)->head);
|
||||
list_del(&first->head);
|
||||
list_del(&second->head);
|
||||
nir_instr_remove(first->instr);
|
||||
nir_instr_remove(second->instr);
|
||||
|
||||
|
|
@ -1808,6 +1826,48 @@ add_entry_to_hash_table(struct vectorize_ctx *ctx, struct entry *entry)
|
|||
util_dynarray_append(arr, entry);
|
||||
}
|
||||
|
||||
static void
|
||||
add_entries_from_predecessor(struct vectorize_ctx *ctx, nir_block *block)
|
||||
{
|
||||
nir_cf_node *parent = block->cf_node.parent;
|
||||
bool is_loop_header = false;
|
||||
if (parent->type == nir_cf_node_loop) {
|
||||
nir_loop *loop = nir_cf_node_as_loop(parent);
|
||||
if (block == nir_loop_first_block(loop))
|
||||
is_loop_header = true;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < nir_num_variable_modes; i++) {
|
||||
struct entry *entry = NULL;
|
||||
|
||||
if (is_loop_header) {
|
||||
/* If this is a loop header, just take the last entries of the preheader. */
|
||||
nir_block *preheader = nir_block_cf_tree_prev(block);
|
||||
entry = ctx->per_block_ctx[preheader->index].last_entry[i];
|
||||
} else {
|
||||
/* If all predecessor entries are the same, the entry dominates the block. */
|
||||
bool first_entry = true;
|
||||
set_foreach(&block->predecessors, set_entry) {
|
||||
nir_block *pred = (nir_block *)set_entry->key;
|
||||
if (!first_entry && entry != ctx->per_block_ctx[pred->index].last_entry[i]) {
|
||||
entry = NULL;
|
||||
break;
|
||||
}
|
||||
entry = ctx->per_block_ctx[pred->index].last_entry[i];
|
||||
first_entry = false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Insert into list and hash table. */
|
||||
if (entry) {
|
||||
/* Ensure that the predecessor entry is always considered as first. */
|
||||
entry->index = -1;
|
||||
list_addtail(&entry->head, &ctx->entries[i]);
|
||||
add_entry_to_hash_table(ctx, entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
|
||||
{
|
||||
|
|
@ -1821,6 +1881,8 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
|
|||
_mesa_hash_table_clear(ctx->stores[i], NULL);
|
||||
}
|
||||
|
||||
add_entries_from_predecessor(ctx, block);
|
||||
|
||||
/* create entries */
|
||||
unsigned next_index = 0;
|
||||
|
||||
|
|
@ -1860,6 +1922,20 @@ process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *blo
|
|||
for (unsigned i = 0; i < nir_num_variable_modes; i++) {
|
||||
progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
|
||||
progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
|
||||
|
||||
/* Remember the last load-entry of each type:
|
||||
* We are allowing to vectorize at most one entry across blocks.
|
||||
*/
|
||||
if (!list_is_empty(&ctx->entries[i])) {
|
||||
struct entry *entry = list_entry(ctx->entries[i].prev, struct entry, head);
|
||||
|
||||
/* For now, only allow per-component bounds-checked modes. We can do better
|
||||
* if we check alignment, whether the second entry can speculate or if the
|
||||
* second entry's block post-dominates the instruction's block.
|
||||
*/
|
||||
if (!entry->is_store && (get_variable_mode(entry) & ctx->options->bounds_checked_modes))
|
||||
ctx->per_block_ctx[block->index].last_entry[i] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
|
|
@ -1888,6 +1964,9 @@ nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_
|
|||
nir_shader_index_vars(shader, options->modes);
|
||||
|
||||
nir_foreach_function_impl(impl, shader) {
|
||||
nir_metadata_require(impl, nir_metadata_block_index);
|
||||
ctx->per_block_ctx = rzalloc_array(ctx, struct block_ctx, impl->num_blocks);
|
||||
|
||||
if (options->modes & nir_var_function_temp)
|
||||
nir_function_impl_index_vars(impl);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue