nir/opt_load_store_vectorize: allow a 4-byte hole between 2 loads

If there is a 4-byte hole between 2 loads, drivers can now optionally
vectorize the loads by including the hole between them, e.g.:
    4B load + 4B hole + 8B load --> 16B load

All vectorize callbacks already reject all holes, but AMD will want to
allow it.

radeonsi+ACO with the new vectorization callback:

TOTALS FROM AFFECTED SHADERS (25248/58918)
  VGPRs: 871116 -> 871872 (0.09 %)
  Spilled SGPRs: 397 -> 407 (2.52 %)
  Code Size: 43074536 -> 42496352 (-1.34 %) bytes

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29398>
This commit is contained in:
Marek Olšák 2024-05-24 14:47:39 -04:00 committed by Marge Bot
parent 80c156422d
commit a44e5cfccf
2 changed files with 100 additions and 3 deletions

View file

@ -652,9 +652,14 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
return false;
unsigned low_size = low->intrin->num_components * get_bit_size(low) / 8;
/* The raw difference can be negative if the low and high instructions
 * overlap, so clamp the hole size to 0. */
unsigned hole_size =
MAX2(high->offset_signed - (low->offset_signed + low_size), 0);
if (!ctx->options->callback(low->align_mul,
low->align_offset,
new_bit_size, new_num_components, 0,
new_bit_size, new_num_components, hole_size,
low->intrin, high->intrin,
ctx->options->cb_data))
return false;
@ -1320,7 +1325,16 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
struct entry *second = low->index < high->index ? high : low;
uint64_t diff = high->offset_signed - low->offset_signed;
bool separate = diff > get_bit_size(low) / 8u * low->num_components;
/* Allow overfetching by 4 bytes, which can be rejected
* by the callback if needed.
*/
unsigned max_hole =
first->is_store ||
(ctx->options->has_shared2_amd &&
get_variable_mode(first) == nir_var_mem_shared) ? 0 : 4;
unsigned low_size = get_bit_size(low) / 8u * low->num_components;
bool separate = diff > max_hole + low_size;
if (separate) {
if (!ctx->options->has_shared2_amd ||
get_variable_mode(first) != nir_var_mem_shared)

View file

@ -83,6 +83,7 @@ protected:
std::map<unsigned, nir_def*> res_map;
unsigned max_components = 4;
bool overfetch = false;
unsigned max_hole_size = 0;
};
std::string
@ -345,7 +346,9 @@ bool nir_load_store_vectorize_test::mem_vectorize_callback(
{
nir_load_store_vectorize_test *test = (nir_load_store_vectorize_test *)data;
if (hole_size ||
assert(hole_size <= 4);
if (hole_size > test->max_hole_size ||
(!test->overfetch && !nir_num_components_valid(num_components)))
return false;
@ -2177,3 +2180,83 @@ TEST_F(nir_load_store_vectorize_test, ubo_vec7as8_vec1)
/* TODO: This is not merged by the pass, but we could implement it. */
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1_disallowed)
{
/* vec2 at bytes 0-7 and a scalar at bytes 12-15: a 4-byte hole at 8-11. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* max_hole_size defaults to 0, so the callback rejects the merge and
 * both loads must survive the pass unchanged. */
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1)
{
/* Same layout as the _disallowed variant: vec2 at 0-7, scalar at 12-15,
 * leaving a 4-byte hole at bytes 8-11. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* With the callback accepting holes up to 4 bytes, both loads should be
 * merged into a single 16-byte vec4 load spanning the hole. */
this->max_hole_size = 4;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
ASSERT_EQ(load->def.bit_size, 32);
ASSERT_EQ(load->def.num_components, 4);
ASSERT_EQ(nir_intrinsic_range_base(load), 0);
ASSERT_EQ(nir_intrinsic_range(load), 16);
/* Components x, y (first load) and w (second load) are read; z is the
 * hole and must stay unused. */
ASSERT_EQ(nir_def_components_read(&load->def), 1 | 2 | 8);
ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xy");
EXPECT_INSTR_SWIZZLES(movs[0x2], load, "w");
}
TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole2_vec4_disallowed)
{
/* vec2 at bytes 0-7 and a scalar at bytes 16-19: an 8-byte hole at 8-15. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 1);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
/* The pass only allows 4-byte holes. */
/* Even with the callback permitting 8 bytes, the pass itself caps holes
 * at 4 bytes, so the loads must remain separate. */
this->max_hole_size = 8;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
}
TEST_F(nir_load_store_vectorize_test, ubo_vec3_hole1_vec3)
{
/* vec3 at bytes 0-11 and vec3 at bytes 16-27: a 4-byte hole at 12-15.
 * Merging needs 3 + 1 (hole) + 3 = 7 components, padded to a vec8, so
 * overfetch must also be enabled for the callback to accept it. */
create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 3);
create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 3);
nir_validate_shader(b->shader, NULL);
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
this->overfetch = true;
this->max_hole_size = 4;
EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
this->max_hole_size = 0;
this->overfetch = false;
ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
ASSERT_EQ(load->def.bit_size, 32);
ASSERT_EQ(load->def.num_components, 8);
ASSERT_EQ(nir_intrinsic_range_base(load), 0);
ASSERT_EQ(nir_intrinsic_range(load), 32);
/* 0x77 = components 0-2 (first vec3) and 4-6 (second vec3); component 3
 * is the hole and component 7 is overfetch padding — both unused. */
ASSERT_EQ(nir_def_components_read(&load->def), 0x77);
ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xyz");
EXPECT_INSTR_SWIZZLES(movs[0x2], load, "efg");
}