From 72e0eda2601beccf60ca4c0cde16e72b5a468050 Mon Sep 17 00:00:00 2001
From: Olivia Lee
Date: Mon, 6 Apr 2026 11:14:09 -0700
Subject: [PATCH] pan/bi: fix memory access alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Memory accesses need to be aligned up to the next power of two of the
full access size. Component count and bit-size don't matter to the
hardware, only the total size.

shader-db results are pretty much what you would expect: there are a few
shaders that have increased LS instructions as a result of splitting
accesses to satisfy alignment requirements that were previously ignored.
The one surprising thing is that there are several shaders that have
reduced uniform usage. Looking at some of these individually, what
happened is that splitting UBO loads early allowed the compiler to
eliminate loads from unused ranges of the access.

total instrs in shared programs: 719166 -> 719174 (<.01%)
instrs in affected programs: 2355 -> 2363 (0.34%)
helped: 4
HURT: 6
helped stats (abs) min: 1.0 max: 9.0 x̄: 3.00 x̃: 1
helped stats (rel) min: 0.36% max: 6.52% x̄: 1.99% x̃: 0.54%
HURT stats (abs) min: 1.0 max: 4.0 x̄: 3.33 x̃: 4
HURT stats (rel) min: 0.65% max: 2.13% x̄: 1.38% x̃: 1.48%
95% mean confidence interval for instrs value: -2.14 3.74
95% mean confidence interval for instrs %-change: -1.76% 1.82%
Inconclusive result (value mean confidence interval includes 0).

total cycles in shared programs: 30210.83 -> 30218.81 (0.03%)
cycles in affected programs: 50 -> 57.99 (15.97%)
helped: 2
HURT: 6
helped stats (abs) min: 0.0078129999999999589 max: 0.070312000000000041 x̄: 0.04 x̃: 0
helped stats (rel) min: 1.10% max: 10.23% x̄: 5.66% x̃: 5.66%
HURT stats (abs) min: 0.03125 max: 5.0 x̄: 1.34 x̃: 1
HURT stats (rel) min: 2.38% max: 25.00% x̄: 13.05% x̃: 14.26%
95% mean confidence interval for cycles value: -0.42 2.41
95% mean confidence interval for cycles %-change: -1.74% 18.49%
Inconclusive result (value mean confidence interval includes 0).

total cvt in shared programs: 2385.91 -> 2385.91 (<.01%)
cvt in affected programs: 11.14 -> 11.14 (<.01%)
helped: 5
HURT: 4
helped stats (abs) min: 0.0078119999999999301 max: 0.070312000000000041 x̄: 0.02 x̃: 0
helped stats (rel) min: 0.27% max: 10.23% x̄: 2.61% x̃: 0.82%
HURT stats (abs) min: 0.01562600000000014 max: 0.03125 x̄: 0.03 x̃: 0
HURT stats (rel) min: 1.31% max: 2.75% x̄: 2.21% x̃: 2.40%
95% mean confidence interval for cvt value: -0.02 0.02
95% mean confidence interval for cvt %-change: -3.51% 2.58%
Inconclusive result (value mean confidence interval includes 0).

total ls in shared programs: 25871 -> 25879 (0.03%)
ls in affected programs: 46 -> 54 (17.39%)
helped: 0
HURT: 4
HURT stats (abs) min: 1.0 max: 5.0 x̄: 2.00 x̃: 1
HURT stats (rel) min: 10.00% max: 25.00% x̄: 18.38% x̃: 19.26%
95% mean confidence interval for ls value: -1.18 5.18
95% mean confidence interval for ls %-change: 8.46% 28.30%
Inconclusive result (value mean confidence interval includes 0).

total code size in shared programs: 6302848 -> 6302976 (<.01%) code size in affected programs: 1536 -> 1664 (8.33%) helped: 0 HURT: 1 total registers used in shared programs: 117324 -> 117329 (<.01%) registers used in affected programs: 45 -> 50 (11.11%) helped: 1 HURT: 2 helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 helped stats (rel) min: 6.25% max: 6.25% x̄: 6.25% x̃: 6.25% HURT stats (abs) min: 2.0 max: 4.0 x̄: 3.00 x̃: 3 HURT stats (rel) min: 12.50% max: 30.77% x̄: 21.63% x̃: 21.63% total uniforms used in shared programs: 78538 -> 78274 (-0.34%) uniforms used in affected programs: 2688 -> 2424 (-9.82%) helped: 104 HURT: 4 helped stats (abs) min: 1.0 max: 18.0 x̄: 2.65 x̃: 2 helped stats (rel) min: 1.96% max: 54.55% x̄: 12.78% x̃: 11.11% HURT stats (abs) min: 1.0 max: 5.0 x̄: 3.00 x̃: 3 HURT stats (rel) min: 3.70% max: 16.13% x̄: 9.92% x̃: 9.92% 95% mean confidence interval for uniforms used value: -3.01 -1.88 95% mean confidence interval for uniforms used %-change: -14.15% -9.74% Uniforms used are helped. Total CPU time (seconds): 73.26 -> 74.48 (1.67%) Signed-off-by: Olivia Lee Fixes: 2f2738dc9020 (pan/bi: Use nir_lower_mem_access_bit_sizes) Reviewed-by: Eric R. Smith Part-of: --- src/panfrost/compiler/bifrost/bifrost_nir.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c index 7f81d48b6a9..7760b54e1dc 100644 --- a/src/panfrost/compiler/bifrost/bifrost_nir.c +++ b/src/panfrost/compiler/bifrost/bifrost_nir.c @@ -668,6 +668,14 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, /* No more than 16 bytes at a time. */ bytes = MIN2(bytes, 16); + /* All loads must be aligned up to the next power of two of their byte + * size. If we have insufficient alignment, split into smaller loads. 
*/ + unsigned required_align = util_next_power_of_two(bytes); + if (align < required_align) { + bytes = align; + required_align = bytes; + } + /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's * a multiple of 2, use 16-bit loads. Else use 8-bit loads. * @@ -701,15 +709,13 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, } bit_size = MAX2(bit_size, 32); - align = 4; - } else { - align = bit_size / 8; + required_align = 4; } return (nir_mem_access_size_align){ .num_components = num_comps, .bit_size = bit_size, - .align = align, + .align = required_align, .shift = nir_mem_access_shift_method_scalar, }; }