From 5712fc48a99d072c2ac6edffd7c8bb7fb0c50916 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 19 Nov 2024 01:30:28 -0800 Subject: [PATCH] nir: Allow large overfetching holes in the load store vectorizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load_*_uniform_block_intel intrinsics always load either 8x or 16x 32-bit components worth of data (so 32 byte increments). This leads to cases where we load a few components from one vec8, followed by a few components of an adjacent vec8. We want to combine those into a vec16 load, as that loads a whole cacheline at a time, and requires fewer hoops to calculate addresses and request memory loads. So, we allow 7 * 4 = 28 bytes of holes, which handles vec8+vec8 where only the .x component is read. Most drivers and intrinsics will not want such large holes. I thought about adding a per-intrinsic max_hole to the core code, but decided that since we already have driver callbacks, we can just rely on them to reject what makes sense to them. No driver callbacks currently allow holes, so this should not currently affect any drivers. But any work-in-progress branches may need to be updated to reject larger holes. Reviewed-by: Marek Olšák Reviewed-by: Alyssa Rosenzweig Reviewed-by: Lionel Landwerlin Part-of: --- src/compiler/nir/nir_opt_load_store_vectorize.c | 7 ++++--- src/compiler/nir/tests/load_store_vectorizer_tests.cpp | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index 928ba233f09..4b476f97ca1 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -1326,13 +1326,14 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct entry *second = low->index < high->index ? 
high : low; uint64_t diff = high->offset_signed - low->offset_signed; - /* Allow overfetching by 4 bytes, which can be rejected - * by the callback if needed. + /* Allow overfetching by 28 bytes, which can be rejected by the + * callback if needed. Driver callbacks will likely want to + * restrict this to a smaller value, say 4 bytes (or none). */ unsigned max_hole = first->is_store || (ctx->options->has_shared2_amd && - get_variable_mode(first) == nir_var_mem_shared) ? 0 : 4; + get_variable_mode(first) == nir_var_mem_shared) ? 0 : 28; unsigned low_size = get_bit_size(low) / 8u * low->num_components; bool separate = diff > max_hole + low_size; diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp index feae1c51259..66bc813623f 100644 --- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp +++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp @@ -346,8 +346,6 @@ bool nir_load_store_vectorize_test::mem_vectorize_callback( { nir_load_store_vectorize_test *test = (nir_load_store_vectorize_test *)data; - assert(hole_size <= 4); - if (hole_size > test->max_hole_size || (!test->overfetch && !nir_num_components_valid(num_components))) return false;