From 8ce43b7765a43c23c7f7f64628534fad51a4ef62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 15 Jun 2024 00:16:08 -0400 Subject: [PATCH] nir/opt_load_store_vectorize: add entry::num_components We will represent vec6..vec7, vec9..vec15 loads with 8 and 16 components respectively, so we need to track how many components we really use. This is a prerequisite for optimal merging up to vec16. Example: Step 1: vec4 + vec3 ==> vec7as8 (last component unused) Step 2: vec1 + vec7as8 ==> vec8 (last unused component dropped) Without using the number of components read, the same example would end up doing: Step 1: vec4 + vec3 ==> vec8 Step 2: vec1 + vec8 ==> vec9 (fail) Reviewed-by: Alyssa Rosenzweig Reviewed-by: Kenneth Graunke Reviewed-by: Rhys Perry Part-of: --- src/compiler/nir/nir.h | 6 ++++ .../nir/nir_opt_load_store_vectorize.c | 30 +++++++++++-------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 9d6801350eb..1b0304d5401 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5068,6 +5068,12 @@ nir_component_mask_t nir_src_components_read(const nir_src *src); nir_component_mask_t nir_def_components_read(const nir_def *def); bool nir_def_all_uses_are_fsat(const nir_def *def); +static inline int +nir_def_last_component_read(nir_def *def) +{ + return (int)util_last_bit(nir_def_components_read(def)) - 1; +} + static inline bool nir_def_is_unused(nir_def *ssa) { diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index e095b3189c8..ede9fe8a243 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -151,6 +151,7 @@ struct entry { nir_instr *instr; nir_intrinsic_instr *intrin; + unsigned num_components; const struct intrinsic_info *info; enum gl_access_qualifier access; bool is_store; @@ -553,6 +554,7 @@ create_entry(void *mem_ctx, entry->instr = &intrin->instr; entry->info = info; entry->is_store = entry->info->value_src >= 0; + entry->num_components = intrin->num_components; if (entry->info->deref_src >= 0) { entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]); @@ -646,8 +648,8 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size, return false; if (low->is_store) { - unsigned low_size = low->intrin->num_components * get_bit_size(low); - unsigned high_size = high->intrin->num_components * get_bit_size(high); + unsigned low_size = low->num_components * get_bit_size(low); + unsigned high_size = high->num_components * get_bit_size(high); if (low_size % new_bit_size != 0) return false; @@ -737,6 +739,7 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx, /* update the intrinsic */ first->intrin->num_components = new_num_components; + first->num_components = nir_def_last_component_read(data) + 1; const struct intrinsic_info *info = first->info; @@ -795,7 +798,7 @@ vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx, unsigned new_bit_size, unsigned new_num_components, unsigned high_start) { - ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low); + ASSERTED unsigned low_size = low->num_components * get_bit_size(low); assert(low_size % new_bit_size == 0); b->cursor = nir_before_instr(second->instr); @@ -842,6 +845,7 @@ vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx, if (nir_intrinsic_has_write_mask(second->intrin)) nir_intrinsic_set_write_mask(second->intrin, write_mask); second->intrin->num_components = data->num_components; + second->num_components = data->num_components; const struct intrinsic_info *info = second->info; assert(info->value_src >= 0); @@ -961,11 +965,11 @@ may_alias(nir_shader *shader, struct entry *a, struct entry *b) /* TODO: we can look closer at the entry keys */ int64_t diff = compare_entries(a, b); if (diff != INT64_MAX) { - /* with atomics, intrin->num_components can be 0 */ + /* with atomics, nir_intrinsic_instr::num_components can be 0 */ if (diff < 0) - return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u); + return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u); else - return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u); + return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u); } /* TODO: we can use deref information */ @@ -1131,8 +1135,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx, /* gather information */ unsigned low_bit_size = get_bit_size(low); unsigned high_bit_size = get_bit_size(high); - unsigned low_size = low->intrin->num_components * low_bit_size; - unsigned high_size = high->intrin->num_components * high_bit_size; + unsigned low_size = low->num_components * low_bit_size; + unsigned high_size = high->num_components * high_bit_size; unsigned new_size = MAX2(diff * 8u + high_size, low_size); /* find a good bit size for the new load/store */ @@ -1179,8 +1183,8 @@ try_vectorize_shared2(struct vectorize_ctx *ctx, unsigned low_bit_size = get_bit_size(low); unsigned high_bit_size = get_bit_size(high); - unsigned low_size = low->intrin->num_components * low_bit_size / 8; - unsigned high_size = high->intrin->num_components * high_bit_size / 8; + unsigned low_size = low->num_components * low_bit_size / 8; + unsigned high_size = high->num_components * high_bit_size / 8; if ((low_size != 4 && low_size != 8) || (high_size != 4 && high_size != 8)) return false; if (low_size != high_size) @@ -1201,9 +1205,9 @@ try_vectorize_shared2(struct vectorize_ctx *ctx, return false; if (first->is_store) { - if (get_write_mask(low->intrin) != BITFIELD_MASK(low->intrin->num_components)) + if (get_write_mask(low->intrin) != BITFIELD_MASK(low->num_components)) return false; - if (get_write_mask(high->intrin) != BITFIELD_MASK(high->intrin->num_components)) + if (get_write_mask(high->intrin) != BITFIELD_MASK(high->num_components)) return false; } @@ -1269,7 +1273,7 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct entry *second = low->index < high->index ? high : low; uint64_t diff = high->offset_signed - low->offset_signed; - bool separate = diff > get_bit_size(low) / 8u * low->intrin->num_components; + bool separate = diff > get_bit_size(low) / 8u * low->num_components; if (separate) { if (!ctx->options->has_shared2_amd || get_variable_mode(first) != nir_var_mem_shared)