mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 18:18:06 +02:00
nir/opt_load_store_vectorize: create load_shared2_amd/store_shared2_amd
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13778>
This commit is contained in:
parent
dc835626b3
commit
778fc176b1
2 changed files with 106 additions and 15 deletions
|
|
@ -5426,6 +5426,7 @@ typedef struct {
|
|||
nir_variable_mode modes;
|
||||
nir_variable_mode robust_modes;
|
||||
void *cb_data;
|
||||
bool has_shared2_amd;
|
||||
} nir_load_store_vectorize_options;
|
||||
|
||||
bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
|
||||
|
|
|
|||
|
|
@ -1104,9 +1104,7 @@ is_strided_vector(const struct glsl_type *type)
|
|||
}
|
||||
|
||||
static bool
|
||||
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!(get_variable_mode(first) & ctx->options->modes) ||
|
||||
!(get_variable_mode(second) & ctx->options->modes))
|
||||
|
|
@ -1115,16 +1113,27 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
|||
if (check_for_aliasing(ctx, first, second))
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
|
||||
/* we can only vectorize non-volatile loads/stores of the same type and with
|
||||
* the same access */
|
||||
if (first->info != second->info || first->access != second->access ||
|
||||
(first->access & ACCESS_VOLATILE) || first->info->is_atomic)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!can_vectorize(ctx, first, second))
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (check_for_robustness(ctx, low, diff))
|
||||
return false;
|
||||
|
||||
/* don't attempt to vectorize accesses of row-major matrix columns */
|
||||
if (first->deref) {
|
||||
const struct glsl_type *first_type = first->deref->type;
|
||||
|
|
@ -1175,6 +1184,76 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_vectorize_shared2(nir_function_impl *impl, struct vectorize_ctx *ctx,
|
||||
struct entry *low, struct entry *high,
|
||||
struct entry *first, struct entry *second)
|
||||
{
|
||||
if (!can_vectorize(ctx, first, second) || first->deref)
|
||||
return false;
|
||||
|
||||
unsigned low_bit_size = get_bit_size(low);
|
||||
unsigned high_bit_size = get_bit_size(high);
|
||||
unsigned low_size = low->intrin->num_components * low_bit_size / 8;
|
||||
unsigned high_size = high->intrin->num_components * high_bit_size / 8;
|
||||
if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8))
|
||||
return false;
|
||||
if (low_size != high_size)
|
||||
return false;
|
||||
if (low->align_mul % low_size || low->align_offset % low_size)
|
||||
return false;
|
||||
if (high->align_mul % low_size || high->align_offset % low_size)
|
||||
return false;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
bool st64 = diff % (64 * low_size) == 0;
|
||||
unsigned stride = st64 ? 64 * low_size : low_size;
|
||||
if (diff % stride || diff > 255 * stride)
|
||||
return false;
|
||||
|
||||
/* try to avoid creating accesses we can't combine additions/offsets into */
|
||||
if (high->offset > 255 * stride || (st64 && high->offset % stride))
|
||||
return false;
|
||||
|
||||
if (first->is_store) {
|
||||
if (nir_intrinsic_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components))
|
||||
return false;
|
||||
if (nir_intrinsic_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components))
|
||||
return false;
|
||||
}
|
||||
|
||||
/* vectorize the accesses */
|
||||
nir_builder b;
|
||||
nir_builder_init(&b, impl);
|
||||
|
||||
b.cursor = nir_after_instr(first->is_store ? second->instr : first->instr);
|
||||
|
||||
nir_ssa_def *offset = first->intrin->src[first->is_store].ssa;
|
||||
offset = nir_iadd_imm(&b, offset, nir_intrinsic_base(first->intrin));
|
||||
if (first != low)
|
||||
offset = nir_iadd_imm(&b, offset, -(int)diff);
|
||||
|
||||
if (first->is_store) {
|
||||
nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
|
||||
nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
|
||||
nir_ssa_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
|
||||
nir_bitcast_vector(&b, high_val, low_size * 8u));
|
||||
nir_store_shared2_amd(&b, val, offset, .offset1=diff/stride, .st64=st64);
|
||||
} else {
|
||||
nir_ssa_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1=diff/stride,
|
||||
.st64=st64);
|
||||
nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa,
|
||||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
|
||||
nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa,
|
||||
nir_bitcast_vector(&b, nir_channel(&b, new_def, 1), high_bit_size));
|
||||
}
|
||||
|
||||
nir_instr_remove(first->instr);
|
||||
nir_instr_remove(second->instr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
update_align(struct entry *entry)
|
||||
{
|
||||
|
|
@ -1204,17 +1283,28 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
|
|||
if (!high)
|
||||
continue;
|
||||
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
|
||||
break;
|
||||
|
||||
struct entry *first = low->index < high->index ? low : high;
|
||||
struct entry *second = low->index < high->index ? high : low;
|
||||
|
||||
if (try_vectorize(impl, ctx, low, high, first, second)) {
|
||||
low = low->is_store ? second : first;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
uint64_t diff = high->offset_signed - low->offset_signed;
|
||||
bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components;
|
||||
if (separate) {
|
||||
if (!ctx->options->has_shared2_amd ||
|
||||
get_variable_mode(first) != nir_var_mem_shared)
|
||||
break;
|
||||
|
||||
if (try_vectorize_shared2(impl, ctx, low, high, first, second)) {
|
||||
low = NULL;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (try_vectorize(impl, ctx, low, high, first, second)) {
|
||||
low = low->is_store ? second : first;
|
||||
*util_dynarray_element(arr, struct entry *, second_idx) = NULL;
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue