nir/opt_load_store_vectorize: allow a 4-byte hole between 2 loads
If there is a 4-byte hole between 2 loads, drivers can now optionally
vectorize the loads by including the hole between them, e.g.:
4B load + 4B hole + 8B load --> 16B load
All existing vectorize callbacks reject any hole for now, but AMD will want to
allow them.
radeonsi+ACO with the new vectorization callback:
TOTALS FROM AFFECTED SHADERS (25248/58918)
VGPRs: 871116 -> 871872 (0.09 %)
Spilled SGPRs: 397 -> 407 (2.52 %)
Code Size: 43074536 -> 42496352 (-1.34 %) bytes
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29398>
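For drivers that want to opt in, the callback now receives the size of the hole and can accept or reject the merge. Below is a minimal sketch of such a callback and how it might be hooked up; it is not the radeonsi/ACO callback referenced above. The parameter order follows the ctx->options->callback(...) call site in the first hunk below, while example_mem_vectorize_cb, the size/alignment limits, and the option-struct initializer are assumptions for illustration only.

    #include "nir.h"

    /* Hypothetical callback sketch, not the radeonsi/ACO implementation.
     * Parameter order mirrors ctx->options->callback(align_mul, align_offset,
     * bit_size, num_components, hole_size, low, high, cb_data) in the pass. */
    static bool
    example_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                             unsigned bit_size, unsigned num_components,
                             unsigned hole_size,
                             nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                             void *cb_data)
    {
       /* The pass only introduces holes between loads and never more than
        * 4 bytes, but guard against larger holes anyway. */
       if (hole_size > 4)
          return false;

       /* Keep the merged access at or below 128 bits (a dword vec4). */
       if (bit_size * num_components > 128)
          return false;

       /* Require dword alignment for the merged access. */
       return align_mul >= 4 && align_offset % 4 == 0;
    }

    /* Hypothetical hook-up; only .callback, .cb_data and .has_shared2_amd are
     * visible in the hunks below, so the other field names are assumptions. */
    static void
    example_run_vectorizer(nir_shader *shader)
    {
       const nir_load_store_vectorize_options opts = {
          .modes = nir_var_mem_ubo | nir_var_mem_ssbo,
          .callback = example_mem_vectorize_cb,
          .cb_data = NULL,
       };
       nir_opt_load_store_vectorize(shader, &opts);
    }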
commit a44e5cfccf (parent 80c156422d)
2 changed files with 100 additions and 3 deletions
src/compiler/nir/nir_opt_load_store_vectorize.c

@@ -652,9 +652,14 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
    if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
       return false;
 
+   unsigned low_size = low->intrin->num_components * get_bit_size(low) / 8;
+   /* The hole size can be less than 0 if low and high instructions overlap. */
+   unsigned hole_size =
+      MAX2(high->offset_signed - (low->offset_signed + low_size), 0);
+
    if (!ctx->options->callback(low->align_mul,
                                low->align_offset,
-                               new_bit_size, new_num_components, 0,
+                               new_bit_size, new_num_components, hole_size,
                                low->intrin, high->intrin,
                                ctx->options->cb_data))
       return false;
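To make the arithmetic concrete with the hypothetical offsets from the commit message's example: a 4-byte load at offset 0 followed by an 8-byte load at offset 8 gives low_size = 4 and hole_size = MAX2(8 - (0 + 4), 0) = 4, so the callback is asked about a 4-byte hole; if it accepts, the pair becomes a single 16-byte load covering bytes 0-15. Overlapping accesses (e.g. the second load starting at offset 2) make the difference negative, which the MAX2 clamps to a hole size of 0.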
@@ -1320,7 +1325,16 @@ vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
          struct entry *second = low->index < high->index ? high : low;
 
          uint64_t diff = high->offset_signed - low->offset_signed;
-         bool separate = diff > get_bit_size(low) / 8u * low->num_components;
+         /* Allow overfetching by 4 bytes, which can be rejected
+          * by the callback if needed.
+          */
+         unsigned max_hole =
+            first->is_store ||
+            (ctx->options->has_shared2_amd &&
+             get_variable_mode(first) == nir_var_mem_shared) ? 0 : 4;
+         unsigned low_size = get_bit_size(low) / 8u * low->num_components;
+         bool separate = diff > max_hole + low_size;
+
          if (separate) {
             if (!ctx->options->has_shared2_amd ||
                 get_variable_mode(first) != nir_var_mem_shared)
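With the same hypothetical offsets, diff = 8 and low_size = 4, so a load pair gets max_hole = 4 and separate stays false (8 <= 4 + 4), letting the pair reach the callback; a store pair gets max_hole = 0 and is kept separate, so callbacks are never asked about holes in stores. Shared-memory pairs on AMD also keep max_hole = 0, presumably so they continue through the existing shared2_amd handling visible in the context lines above.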
src/compiler/nir/tests/load_store_vectorizer_tests.cpp

@@ -83,6 +83,7 @@ protected:
    std::map<unsigned, nir_def*> res_map;
    unsigned max_components = 4;
    bool overfetch = false;
+   unsigned max_hole_size = 0;
 };
 
 std::string
@@ -345,7 +346,9 @@ bool nir_load_store_vectorize_test::mem_vectorize_callback(
 {
    nir_load_store_vectorize_test *test = (nir_load_store_vectorize_test *)data;
 
-   if (hole_size ||
+   assert(hole_size <= 4);
+
+   if (hole_size > test->max_hole_size ||
        (!test->overfetch && !nir_num_components_valid(num_components)))
       return false;
 
@@ -2177,3 +2180,83 @@ TEST_F(nir_load_store_vectorize_test, ubo_vec7as8_vec1)
    /* TODO: This is not merged by the pass, but we could implement it. */
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
 }
+
+TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1_disallowed)
+{
+   create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
+   create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+}
+
+TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole1_vec1)
+{
+   create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
+   create_load(nir_var_mem_ubo, 0, 12, 0x2, 32, 1);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+
+   this->max_hole_size = 4;
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
+   this->max_hole_size = 0;
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
+
+   nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
+   ASSERT_EQ(load->def.bit_size, 32);
+   ASSERT_EQ(load->def.num_components, 4);
+   ASSERT_EQ(nir_intrinsic_range_base(load), 0);
+   ASSERT_EQ(nir_intrinsic_range(load), 16);
+   ASSERT_EQ(nir_def_components_read(&load->def), 1 | 2 | 8);
+   ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
+   EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xy");
+   EXPECT_INSTR_SWIZZLES(movs[0x2], load, "w");
+}
+
+TEST_F(nir_load_store_vectorize_test, ubo_vec2_hole2_vec4_disallowed)
+{
+   create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 2);
+   create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 1);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+
+   /* The pass only allows 4-byte holes. */
+   this->max_hole_size = 8;
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
+   this->max_hole_size = 0;
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+}
+
+TEST_F(nir_load_store_vectorize_test, ubo_vec3_hole1_vec3)
+{
+   create_load(nir_var_mem_ubo, 0, 0, 0x1, 32, 3);
+   create_load(nir_var_mem_ubo, 0, 16, 0x2, 32, 3);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 2);
+
+   this->overfetch = true;
+   this->max_hole_size = 4;
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ubo));
+   this->max_hole_size = 0;
+   this->overfetch = false;
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ubo), 1);
+
+   nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ubo, 0);
+   ASSERT_EQ(load->def.bit_size, 32);
+   ASSERT_EQ(load->def.num_components, 8);
+   ASSERT_EQ(nir_intrinsic_range_base(load), 0);
+   ASSERT_EQ(nir_intrinsic_range(load), 32);
+   ASSERT_EQ(nir_def_components_read(&load->def), 0x77);
+   ASSERT_EQ(nir_src_as_uint(load->src[1]), 0);
+   EXPECT_INSTR_SWIZZLES(movs[0x1], load, "xyz");
+   EXPECT_INSTR_SWIZZLES(movs[0x2], load, "efg");
+}
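Reading the expectations in ubo_vec2_hole1_vec1: the vec2 at offset 0 occupies components x/y of the merged vec4 load, bytes 8-11 are the hole (component z, never read), and the vec1 at offset 12 maps to component w, hence the components-read mask of 1 | 2 | 8 and the "w" swizzle for the second load. In ubo_vec3_hole1_vec3, the test's overfetch flag lets its callback accept component counts that are not normally valid NIR vector sizes, and the final merged load is a vec8 with read mask 0x77, the second vec3 coming from components 4-6 ("efg").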