pan/compiler: Lower unaligned scratch memory accesses

Using OpenCL size/alignment requirements we might get some types
with a size bigger than their alignment.  This breaks the current TLS
loads/stores, which expect 16-byte alignment for 16-byte accesses.  The
problem probably hasn't surfaced yet because we assign OpenCL scratch
in 16-byte slots, but it will break if we compact the layout.
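A hypothetical illustration (not part of this change): under OpenCL rules a
struct of three floats has size 12 but alignment 4, so a compacted scratch
layout could place it at byte offset 8:

    struct vec3 { float x, y, z; };  /* sizeof = 12, alignof = 4 */

A 12-byte access at offset 8 covers bytes 8..19 and straddles a 16-byte
line, which the current TLS path cannot do in a single access.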

Signed-off-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40924>

@@ -656,18 +656,148 @@ bifrost_nir_lower_load_output(nir_shader *nir)
nir_metadata_control_flow, NULL);
}
static bool
bytes_can_straddle_boundary(unsigned bytes, unsigned align_mul,
unsigned align_off, unsigned boundary)
{
/* addr = k*align_mul + align_off */
assert(IS_POT(align_mul));
assert(align_off < align_mul);
assert(IS_POT(boundary));
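/* If align_mul >= boundary, the address is known modulo the boundary, so we
* can check the exact starting offset within the boundary.  Otherwise we only
* know the address modulo align_mul; any boundary falls on some multiple of
* align_mul, so the access is only guaranteed safe if it stays within a
* single align_mul-sized block.
*/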
if (align_mul >= boundary)
return (align_off % boundary) + bytes > boundary;
else
return align_off + bytes > align_mul;
}
static nir_mem_access_size_align
size_align_for_bytes(uint8_t bytes, uint8_t bit_size,
uint32_t align_mul, uint32_t align_off)
{
/* Grab the largest power of two which divides bytes */
assert(bytes != 0);
uint8_t bytes_max_bit_size = 8 << (ffs(bytes) - 1);
/* Clamp the bit size if needed */
bit_size = MIN3(bit_size, bytes_max_bit_size, 64);
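/* e.g. bytes = 12 with bit_size = 32 gives 3 x 32-bit components, while
* bytes = 6 clamps bit_size to 16 and gives 3 x 16-bit components.
*/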
return (nir_mem_access_size_align) {
.num_components = (bytes * 8) / bit_size,
.bit_size = bit_size,
.align = nir_combined_align(align_mul, align_off),
.shift = nir_mem_access_shift_method_scalar,
};
}
static nir_mem_access_size_align
scratch_access_size_align_v9(uint8_t bytes, uint8_t bit_size,
uint32_t align_mul, uint32_t align_off)
{
/* On v9-v10, we must never straddle the 16-byte boundary. */
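/* e.g. a 16-byte access with align_mul = 8 and align_off = 4 may cross a
* 16-byte line, so it gets shrunk to 4 bytes here and
* nir_lower_mem_access_bit_sizes() lowers the rest as follow-up accesses.
*/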
while (bytes_can_straddle_boundary(bytes, align_mul, align_off, 16))
bytes--;
return size_align_for_bytes(bytes, bit_size, align_mul, align_off);
}
static nir_mem_access_size_align
scratch_access_size_align_v11(uint8_t bytes, uint8_t bit_size,
uint32_t align_mul, uint32_t align_off,
bool is_store)
{
assert(align_off < align_mul);
/* v11+ has complex rules based on how many bytes are accessed: check whether
* the access is legal and, if not, reduce the number of bytes accessed.
*/
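/* e.g. a 12-byte access with 4-byte alignment is not a supported size, so it
* falls through to an 8-byte access and the remaining 4 bytes get lowered as
* a separate access.
*/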
for (; bytes > 1; bytes--) {
switch (bytes) {
case 2:
/* Must not straddle 4-byte boundaries */
if (bytes_can_straddle_boundary(bytes, align_mul, align_off, 4))
continue;
return size_align_for_bytes(1, bit_size, align_mul, align_off);
case 3:
/* No restrictions for store */
if (is_store)
return size_align_for_bytes(3, 8, align_mul, align_off);
/* We can do 3-byte loads as long as they're aligned to 4 bytes.
* If align_off == 1, nir_lower_mem_access_bit_sizes() will upgrade
* to an aligned 4-byte load and shift the result.
*/
if (align_mul >= 4 && align_off <= 1)
return size_align_for_bytes(3, 8, 4, 0);
/* Otherwise, we have to split the load */
continue;
case 4:
case 8:
case 16:
/* Must be 4-byte aligned */
if (nir_combined_align(align_mul, align_off) < 4)
continue;
return size_align_for_bytes(bytes, bit_size, align_mul, align_off);
case 6:
if (is_store) {
/* Stores must not straddle 4-byte boundaries. */
if (bytes_can_straddle_boundary(bytes, align_mul, align_off, 4))
continue;
return size_align_for_bytes(6, bit_size, align_mul, align_off);
} else {
/* Loads must be aligned to 4 bytes. */
if (nir_combined_align(align_mul, align_off) < 4)
continue;
return size_align_for_bytes(6, bit_size, 4, 0);
}
default: /* 5, 7, 9-15 */
/* All other sizes have to fall back to a smaller load */
continue;
}
}
/* 1-byte accesses are always unrestricted */
assert(bytes == 1);
return size_align_for_bytes(1, 8, align_mul, align_off);
}
static nir_mem_access_size_align
mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
uint8_t bit_size, uint32_t align_mul,
uint32_t align_offset, bool offset_is_const,
enum gl_access_qualifier access, const void *cb_data)
{
uint64_t gpu_id = *(uint64_t *)cb_data;
uint32_t align = nir_combined_align(align_mul, align_offset);
assert(util_is_power_of_two_nonzero(align));
bool is_scratch = intrin == nir_intrinsic_load_scratch ||
intrin == nir_intrinsic_store_scratch;
/* No more than 16 bytes at a time. */
bytes = MIN2(bytes, 16);
/* TLS memory requires special alignment, handle it separately */
if (pan_arch(gpu_id) >= 9 && is_scratch) {
bool is_store = intrin == nir_intrinsic_store_scratch;
if (pan_arch(gpu_id) >= 11)
return scratch_access_size_align_v11(bytes, bit_size, align_mul,
align_offset, is_store);
else
return scratch_access_size_align_v9(bytes, bit_size, align_mul,
align_offset);
}
/* All loads must be aligned up to the next power of two of their byte
* size. If we have insufficient alignment, split into smaller loads. */
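/* e.g. an 8-byte load with only 4-byte alignment is reduced to 4-byte loads
* on this path.
*/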
unsigned required_align = util_next_power_of_two(bytes);
@@ -758,6 +888,7 @@ bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id)
nir_var_shader_temp | nir_var_function_temp |
nir_var_mem_global | nir_var_mem_shared,
.callback = mem_access_size_align_cb,
.cb_data = (void *) &gpu_id,
};
NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options);