amd/lower_mem_access_bit_sizes: lower all SMEM instructions to supported sizes

This creates more SMEM instructions, mostly because vec3 64-bit loads are now
split instead of overfetched: a load may only overfetch if that grows it by at
most one dword, and for a 24-byte load 24 + 4 < 32, so it is emitted as a
16-byte load plus an 8-byte load instead of a single 32-byte load.

Totals from 442 (0.55% of 79839) affected shaders (Navi48):

Instrs: 288998 -> 289469 (+0.16%); split: -0.04%, +0.21%
CodeSize: 1538212 -> 1541460 (+0.21%); split: -0.03%, +0.24%
Latency: 3010072 -> 3009373 (-0.02%); split: -0.04%, +0.01%
InvThroughput: 885572 -> 885564 (-0.00%); split: -0.00%, +0.00%
VClause: 6900 -> 6885 (-0.22%); split: -0.28%, +0.06%
SClause: 4457 -> 4469 (+0.27%); split: -0.18%, +0.45%
VALU: 162473 -> 162469 (-0.00%)
SALU: 42871 -> 42855 (-0.04%); split: -0.05%, +0.01%
SMEM: 6893 -> 7239 (+5.02%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37843>
Author:    Daniel Schürmann
Date:      2025-10-08 18:49:09 +02:00
Committer: Marge Bot
Parent:    9553e56c67
Commit:    fbf0399517

@@ -81,6 +81,39 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
       return res;
    }
 
+   if (is_smem) {
+      /* Round up subdword loads if unsupported. */
+      const bool supported_subdword = cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant;
+      if (bit_size < 32 && (bytes >= 3 || !supported_subdword))
+         bytes = align(bytes, 4);
+
+      /* Generally, require an alignment of 4. */
+      res.align = MIN2(4, bytes);
+      bit_size = MAX2(bit_size, res.align * 8);
+
+      /* Maximum SMEM load size is 512 bits (16 dwords). */
+      bytes = MIN2(bytes, 64);
+
+      /* Lower unsupported sizes. */
+      if (!util_is_power_of_two_nonzero(bytes) && (cb_data->gfx_level < GFX12 || bytes != 12)) {
+         const uint8_t larger = util_next_power_of_two(bytes);
+         const uint8_t smaller = larger / 2;
+         const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
+                                     intrin == nir_intrinsic_load_ssbo ||
+                                     intrin == nir_intrinsic_load_constant;
+         const bool is_aligned = align_mul % smaller == 0;
+
+         /* Overfetch up to 1 dword if this is a bounds-checked buffer load or the access is aligned. */
+         bool overfetch = bytes + 4 >= larger && (is_buffer_load || is_aligned);
+         bytes = overfetch ? larger : smaller;
+         res.align = is_aligned ? smaller : res.align;
+      }
+
+      res.num_components = DIV_ROUND_UP(bytes, bit_size / 8);
+      res.bit_size = bit_size;
+      res.shift = nir_mem_access_shift_method_shift64;
+      return res;
+   }
+
    /* Make 8-bit accesses 16-bit if possible */
    if (is_load && bit_size == 8 && combined_align >= 2 && bytes % 2 == 0)
       bit_size = 16;
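
As an illustration of how the new branch chooses sizes, here is a minimal
standalone sketch. MIN2/ALIGN4/next_pow2/is_pow2 are local stand-ins for
Mesa's util helpers, the gfx12/subdword_ok/buffer_or_aligned flags condense
the real gfx_level and intrinsic checks, and smem_chunk_bytes is a made-up
name, not the pass's callback interface:

#include <stdbool.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define ALIGN4(x)  (((x) + 3u) & ~3u)

static unsigned next_pow2(unsigned x) { unsigned p = 1; while (p < x) p <<= 1; return p; }
static bool     is_pow2(unsigned x)   { return x && !(x & (x - 1)); }

/* Returns the byte size of the first load the pass would emit; the lowering
 * framework then invokes the callback again with the remaining bytes. */
static unsigned
smem_chunk_bytes(unsigned bytes, unsigned bit_size, bool gfx12,
                 bool subdword_ok, bool buffer_or_aligned)
{
   /* Round up subdword loads if unsupported. */
   if (bit_size < 32 && (bytes >= 3 || !subdword_ok))
      bytes = ALIGN4(bytes);
   /* Maximum SMEM load size is 512 bits (16 dwords). */
   bytes = MIN2(bytes, 64u);
   /* Split or overfetch unsupported sizes (GFX12 also supports 12 bytes). */
   if (!is_pow2(bytes) && !(gfx12 && bytes == 12)) {
      unsigned larger = next_pow2(bytes);
      unsigned smaller = larger / 2;
      bool overfetch = bytes + 4 >= larger && buffer_or_aligned;
      bytes = overfetch ? larger : smaller;
   }
   return bytes;
}

int main(void)
{
   printf("%u\n", smem_chunk_bytes(2, 16, false, false, true));  /* 4: rounded up  */
   printf("%u\n", smem_chunk_bytes(24, 64, false, false, true)); /* 16: split 16+8 */
   printf("%u\n", smem_chunk_bytes(12, 32, false, false, true)); /* 16: overfetch  */
   printf("%u\n", smem_chunk_bytes(12, 32, true, true, true));   /* 12: native     */
   printf("%u\n", smem_chunk_bytes(96, 32, false, false, true)); /* 64: clamped    */
   return 0;
}

Since the framework re-invokes the callback with the remainder, the 24-byte
case becomes a 16-byte load followed by an 8-byte load — the vec3 64-bit
split mentioned in the commit message above.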
@@ -94,8 +127,6 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
    if (cb_data->use_llvm && access & (ACCESS_COHERENT | ACCESS_VOLATILE) &&
        (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_store_global))
       max_components = 1;
-   else if (is_smem)
-      max_components = MIN2(512 / bit_size, 16);
 
    res.num_components = MIN2(DIV_ROUND_UP(bytes, bit_size / 8), max_components);
    res.bit_size = bit_size;
@@ -105,11 +136,8 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
    if (!is_load)
       return res;
 
-   /* Lower 8/16-bit loads to 32-bit, unless it's a VMEM (or SMEM on GFX12+) scalar load. */
-   const bool supports_scalar_subdword =
-      !is_smem || (cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant);
-   const bool supported_subdword = res.num_components == 1 && supports_scalar_subdword &&
+   /* Lower 8/16-bit loads to 32-bit, unless it's a scalar load. */
+   const bool supported_subdword = res.num_components == 1 &&
       (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
 
    if (res.bit_size >= 32 || supported_subdword)
@@ -122,7 +150,7 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
    if (align_mul < 4) {
-      /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
       const unsigned chunk_bytes = align(bytes, 4) - max_pad;
-      if (!is_smem && chunk_bytes < bytes)
+      if (chunk_bytes < bytes)
         return res;
    }
@@ -135,7 +163,7 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
    res.num_components = MIN2(res.num_components, max_components);
    res.bit_size = 32;
    res.align = 4;
-   res.shift = is_smem ? res.shift : nir_mem_access_shift_method_bytealign_amd;
+   res.shift = nir_mem_access_shift_method_bytealign_amd;
 
    return res;
 }
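
For context on the two shift methods: both describe how the bytes of a wider,
aligned lowered load are moved into place for the original access; SMEM keeps
the shift64 variant set in the new branch, while VMEM uses bytealign_amd. A
rough standalone illustration of the shift idea, assuming little-endian
dwords (plain C, not NIR/ACO code; extract_unaligned_dword is a made-up
helper):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Recover a 32-bit value that straddles two aligned dwords by combining
 * them into a 64-bit value and shifting right by the byte offset. */
static uint32_t
extract_unaligned_dword(uint32_t lo, uint32_t hi, unsigned byte_offset)
{
   uint64_t both = ((uint64_t)hi << 32) | lo;
   return (uint32_t)(both >> (byte_offset * 8));
}

int main(void)
{
   const uint8_t mem[8] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
   uint32_t lo, hi;
   memcpy(&lo, mem, 4);
   memcpy(&hi, mem + 4, 4);
   /* Unaligned 32-bit load at byte offset 2: prints 0x55443322. */
   printf("0x%08x\n", extract_unaligned_dword(lo, hi, 2));
   return 0;
}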