Mirror of https://gitlab.freedesktop.org/mesa/mesa.git
nir/load_store_vectorizer: fix check_for_robustness() with indirect loads
fossil-db (GFX10.3, robustness2 enabled):
Totals from 13958 (9.54% of 146267) affected shaders:
VGPRs: 609168 -> 624304 (+2.48%); split: -0.05%, +2.53%
CodeSize: 48229504 -> 48488392 (+0.54%); split: -0.02%, +0.56%
MaxWaves: 354426 -> 349448 (-1.40%); split: +0.00%, -1.41%
Instrs: 9332093 -> 9375053 (+0.46%); split: -0.03%, +0.49%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7295>
Parent: 674b0af3b3
Commit: 2e7bceb220
2 changed files with 132 additions and 11 deletions
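The comment added in the diff below spells out the failure mode: with robust buffer access, each scalar load is bounds-checked on its own, so when the offset addition wraps around, the low load can be out-of-bounds while the high load is still in-bounds; a single vectorized load would bounds-check the combined range instead and change the result. A minimal standalone sketch of that wrap-around, assuming a 32-bit offset and a made-up buffer size (not code from the commit):

/* Standalone sketch (not Mesa code): an out-of-bounds "low" offset plus a
 * small constant can wrap around to a perfectly valid "high" offset. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint32_t buffer_size = 256;              /* assumed SSBO size */
   uint32_t low_offset  = 0xfffffffcu;      /* out of bounds on its own */
   uint32_t high_offset = low_offset + 4u;  /* wraps around to 0 */

   assert(high_offset == 0u);
   assert(low_offset >= buffer_size);  /* low load gets robust OOB handling */
   assert(high_offset < buffer_size);  /* high load reads real data */
   /* An 8-byte load at low_offset would treat both halves as out-of-bounds,
    * changing what the high component returns, hence the new check. */
   return 0;
}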
@@ -997,20 +997,65 @@ check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry
    return false;
 }
 
-static bool
-check_for_robustness(struct vectorize_ctx *ctx, struct entry *low)
+static uint64_t
+calc_gcd(uint64_t a, uint64_t b)
 {
-   nir_variable_mode mode = get_variable_mode(low);
-   if (mode & ctx->options->robust_modes) {
-      unsigned low_bit_size = get_bit_size(low);
-      unsigned low_size = low->intrin->num_components * low_bit_size;
-
-      /* don't attempt to vectorize accesses if the offset can overflow. */
-      /* TODO: handle indirect accesses. */
-      return low->offset_signed < 0 && low->offset_signed + low_size >= 0;
+   while (b != 0) {
+      int tmp_a = a;
+      a = b;
+      b = tmp_a % b;
    }
-   return false;
+   return a;
 }
+
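A small standalone sketch of how the GCD helper above gets used: folding it over the offset multipliers with an initial stride of 0 works because gcd(x, 0) == x, so the first multiplier passes through unchanged. The multiplier values are made up, and the helper is copied locally (with the temporary widened to uint64_t) so the snippet compiles on its own:

/* Standalone sketch of the GCD fold used to derive a conservative stride. */
#include <assert.h>
#include <stdint.h>

static uint64_t
calc_gcd(uint64_t a, uint64_t b)
{
   while (b != 0) {
      uint64_t tmp_a = a; /* widened to uint64_t in this sketch */
      a = b;
      b = tmp_a % b;
   }
   return a;
}

int main(void)
{
   uint64_t muls[] = {12, 8}; /* hypothetical offset_defs_mul values */
   uint64_t stride = 0;       /* gcd(x, 0) == x, so 0 is a safe start */
   for (unsigned i = 0; i < 2; i++)
      stride = calc_gcd(muls[i], stride);
   assert(stride == 4);       /* offsets are then known multiples of 4 */
   return 0;
}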
+static uint64_t
+round_down(uint64_t a, uint64_t b)
+{
+   return a / b * b;
+}
+
+static bool
+addition_wraps(uint64_t a, uint64_t b, unsigned bits)
+{
+   uint64_t mask = BITFIELD64_MASK(bits);
+   return ((a + b) & mask) < (a & mask);
+}
+
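Likewise, a quick standalone check of addition_wraps() at a given bit width: the sum wraps exactly when the masked result lands below the masked first operand. The BITFIELD64_MASK definition below is a local stand-in for the Mesa macro, and the operand values are illustrative:

/* Standalone sketch: addition_wraps() reports whether (a + b) overflows when
 * truncated to "bits" bits, by checking if the masked sum went backwards. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BITFIELD64_MASK(bits) ((bits) == 64 ? ~0ull : ((1ull << (bits)) - 1))

static bool
addition_wraps(uint64_t a, uint64_t b, unsigned bits)
{
   uint64_t mask = BITFIELD64_MASK(bits);
   return ((a + b) & mask) < (a & mask);
}

int main(void)
{
   assert(addition_wraps(0xfffffffcull, 8, 32));   /* 32-bit sum wraps past 0 */
   assert(!addition_wraps(0xfffffff8ull, 4, 32));  /* stays below 2^32 */
   assert(!addition_wraps(UINT64_MAX - 3, 3, 64)); /* fits exactly in 64 bits */
   return 0;
}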
+/* Return true if the addition of "low"'s offset and "high_offset" could wrap
+ * around.
+ *
+ * This is to prevent a situation where the hardware considers the high load
+ * out-of-bounds after vectorization if the low load is out-of-bounds, even if
+ * the wrap-around from the addition could make the high load in-bounds.
+ */
+static bool
+check_for_robustness(struct vectorize_ctx *ctx, struct entry *low, uint64_t high_offset)
+{
+   nir_variable_mode mode = get_variable_mode(low);
+   if (!(mode & ctx->options->robust_modes))
+      return false;
+
+   /* First, try to use alignment information in case the application provided some. If the addition
+    * of the maximum offset of the low load and "high_offset" wraps around, we can't combine the low
+    * and high loads.
+    */
+   uint64_t max_low = round_down(UINT64_MAX, low->align_mul) + low->align_offset;
+   if (!addition_wraps(max_low, high_offset, 64))
+      return false;
+
+   /* Second, use information about the factors from address calculation (offset_defs_mul). These
+    * are not guaranteed to be power-of-2.
+    */
+   uint64_t stride = 0;
+   for (unsigned i = 0; i < low->key->offset_def_count; i++)
+      stride = calc_gcd(low->key->offset_defs_mul[i], stride);
+
+   unsigned addition_bits = low->intrin->src[low->info->base_src].ssa->bit_size;
+   /* low's offset must be a multiple of "stride" plus "low->offset". */
+   max_low = low->offset;
+   if (stride)
+      max_low = round_down(BITFIELD64_MASK(addition_bits), stride) + (low->offset % stride);
+   return addition_wraps(max_low, high_offset, addition_bits);
+}
+
 static bool
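To make the stride-based half of the check above concrete, here is a standalone sketch that reproduces its arithmetic with local helper copies and made-up inputs: a 32-bit offset source, a 4-byte distance between the low and high loads, and either a stride-8 or a stride-1 indirect offset, the same shapes as the stride8/stride1 tests further down. The alignment-based first check is omitted from this simplified model:

/* Standalone sketch of check_for_robustness()'s stride arithmetic, using
 * local copies of the helpers and hypothetical inputs. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BITFIELD64_MASK(bits) ((bits) == 64 ? ~0ull : ((1ull << (bits)) - 1))

static uint64_t round_down(uint64_t a, uint64_t b) { return a / b * b; }

static bool
addition_wraps(uint64_t a, uint64_t b, unsigned bits)
{
   uint64_t mask = BITFIELD64_MASK(bits);
   return ((a + b) & mask) < (a & mask);
}

/* Simplified model: "stride" is the GCD of the offset multipliers, "offset"
 * is the constant part of the low load's address, "high_offset" the distance
 * to the high load, "addition_bits" the bit size of the offset source. */
static bool
could_wrap(uint64_t stride, uint64_t offset, uint64_t high_offset, unsigned addition_bits)
{
   uint64_t max_low = offset;
   if (stride)
      max_low = round_down(BITFIELD64_MASK(addition_bits), stride) + (offset % stride);
   return addition_wraps(max_low, high_offset, addition_bits);
}

int main(void)
{
   /* stride 8: the largest 32-bit offset of the form 8*k is 0xfffffff8,
    * and 0xfffffff8 + 4 still fits in 32 bits, so vectorizing is safe. */
   assert(!could_wrap(8, 0, 4, 32));

   /* stride 1 (e.g. a raw invocation index): the offset can be 0xffffffff,
    * and 0xffffffff + 4 wraps, so the pair must not be combined. */
   assert(could_wrap(1, 0, 4, 32));
   return 0;
}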
@@ -1037,7 +1082,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    if (check_for_aliasing(ctx, first, second))
       return false;
 
-   if (check_for_robustness(ctx, low))
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   if (check_for_robustness(ctx, low, diff))
       return false;
 
    /* we can only vectorize non-volatile loads/stores of the same type and with
@@ -1055,7 +1101,6 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    }
 
    /* gather information */
-   uint64_t diff = high->offset_signed - low->offset_signed;
    unsigned low_bit_size = get_bit_size(low);
    unsigned high_bit_size = get_bit_size(high);
    unsigned low_size = low->intrin->num_components * low_bit_size;
@@ -1859,6 +1859,82 @@ TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust)
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
 
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride1)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+}
+
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride8)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 8);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1);
+}
+
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride12)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 12);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   nir_ssa_def *offset_4 = nir_iadd_imm(b, offset, 4);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset_4, 0x2);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 8), 0x3);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 3);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0);
+   ASSERT_EQ(load->dest.ssa.bit_size, 32);
+   ASSERT_EQ(load->dest.ssa.num_components, 1);
+   ASSERT_EQ(load->src[1].ssa, offset);
+   EXPECT_INSTR_SWIZZLES(movs[0x1], load, "x");
+
+   load = get_intrinsic(nir_intrinsic_load_ssbo, 1);
+   ASSERT_EQ(load->dest.ssa.bit_size, 32);
+   ASSERT_EQ(load->dest.ssa.num_components, 2);
+   ASSERT_EQ(load->src[1].ssa, offset_4);
+   EXPECT_INSTR_SWIZZLES(movs[0x2], load, "x");
+   EXPECT_INSTR_SWIZZLES(movs[0x3], load, "y");
+}
+
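A short aside on why the stride-12 test above ends with two loads rather than one: with 32-bit offsets and a stride of 12, pairing the +0 load with +4 can still wrap, while pairing +4 with +8 cannot. A standalone arithmetic sketch, with values chosen to mirror the test rather than copied from the pass:

/* Standalone arithmetic sketch for the stride-12 case (32-bit offsets). */
#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Largest 32-bit offset of the form 12*k is 0xfffffffc. */
   uint64_t max_plus0 = UINT64_C(0xffffffff) / 12 * 12;
   assert(max_plus0 == 0xfffffffcu);
   /* +0 paired with +4: 0xfffffffc + 4 overflows 32 bits, so the first
    * load stays scalar. */
   assert(max_plus0 + 4 > UINT64_C(0xffffffff));

   /* Largest 32-bit offset of the form 12*k + 4 is 0xfffffff4. */
   uint64_t max_plus4 = UINT64_C(0xfffffffb) / 12 * 12 + 4;
   assert(max_plus4 == 0xfffffff4u);
   /* +4 paired with +8: 0xfffffff4 + 4 still fits, so those two loads can
    * be combined into the two-component load the test expects. */
   assert(max_plus4 + 4 <= UINT64_C(0xffffffff));
   return 0;
}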
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride16)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 16);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 8), 0x3);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 12), 0x4);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 4);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1);
+}
+
 TEST_F(nir_load_store_vectorize_test, ubo_alignment_16_4)
 {
    nir_ssa_def *offset = nir_load_local_invocation_index(b);