From 92133bb0ab1dc046046b26b17345a409fdfff162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 29 Dec 2025 14:48:46 -0500 Subject: [PATCH] amd: demystify various optimizations we already have for memory channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explain why we do what we do, and use the radeon_info field properly. Reviewed-by: Samuel Pitoiset Acked-by: Timur Kristóf Part-of: --- src/amd/common/ac_gpu_info.c | 2 +- src/amd/common/ac_gpu_info.h | 68 +++++++++++++++++++ src/amd/common/ac_shader_util.c | 18 +++-- .../nir/ac_nir_meta_cs_clear_copy_buffer.c | 9 ++- 4 files changed, 87 insertions(+), 10 deletions(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 079408365f4..071db231393 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -2390,7 +2390,7 @@ void ac_get_task_info(const struct radeon_info *info, * 64K | 550 | 574 | +4.3% * # Adding 256 mitigates the performance loss from increasing num_entries. */ - const uint32_t payload_entry_size = 16384 + 256; + const uint32_t payload_entry_size = 16384 + info->pipe_interleave_bytes; const uint16_t num_entries = get_task_num_entries(info->family); const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES; const uint32_t payload_ring_bytes = num_entries * payload_entry_size; diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 4330f9203d7..b288194d0c9 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -19,6 +19,74 @@ extern "C" { #define AMD_MAX_SA_PER_SE 2 #define AMD_MAX_WGP 60 +/* Memory is divided among memory channels such that each 256B maps to a different memory channel + * and the memory channel index increments with each 256B block, which wraps around to 0 after + * the last memory channel index. + * + * For example, with 16 memory channels, address bits 8:11 contain the memory channel index. + * Let's call them "channel address bits". The number of memory channels can be a non-power-of-two + * on some chips. + * + * AMD GPUs usually assign 16 bits of memory bus to 1 memory channel. For example, 192-bit GDDR + * memory bus has 12 memory channels. APUs usually have 1 memory channel per 32 bits or 64 bits + * of memory bus. The physical memory channels don't always map 1:1 to AMD GPU memory channels. + * + * Memory channels are like separate cores. The advertised bandwidth and cache sizes are always + * for all memory channels combined. That means that each channel has only 1/num_memory_channels + * bandwidth and 1/memory_channels cache size. If all memory accesses unluckily end up in the same + * channel for all running shaders, the available memory bandwidth is only 1/num_memory_channels + * and the available cache size is also only 1/num_memory_channels. With 16 memory channels, that + * would be 16x worse cache and memory performance. + * + * Strategies to distribute work among all memory channels evenly: + * + * - Ring element sizes should be set to an odd multiple of 256 to make sure each element starts on + * a different memory channel. This is similar to how LDS banks work, but the granularity is 256B + * instead of 4B here. The simplest way to do that is that if the ring element size is > 256, + * apply "|= 256;" to it. The scratch ring and the task shader payload ring do this. + * + * - For tree data structures in memory, try to randomize channel address bits, which can be done by + * making sure that tree nodes start on an odd multiple of 256. All possible numbers of + * ((address / 256) % num_memory_channels) should be represented equally in the node addresses. + * + * - If we have a ring buffer where we can't set the ring element size (e.g. TCS outputs where it's + * set to 32K), each workgroup should write at least (num_memory_channels * 256) of TCS outputs + * in bytes, and ideally twice that amount, to make sure each workgroup doesn't leave some memory + * channels (and thus bandwidth) completely unused or underutilized. We could also shift + * the placement of TCS outputs to a random 256*i offset within each 32K segment instead. Our TCS + * workgroup size calculation takes this into account. + * + * - radeon_surf::tile_swizzle is a random number that randomizes channel address bits to make sure + * some fixed image coordinates (x,y) map to a different memory channel for each image, so if + * a shader were to access multiple images at some fixed image coordinates (x,y) with the same + * bpp, each image would load from a different channel if radeon_surf::tile_swizzle is different. + * If multiple render targets are bound, it's recommended that they all have different tile_swizzle, + * so that MRT0 goes to channel A, MRT1 goes to channel B (A != B), etc. Other than that, image + * tiling does the optimal thing for us. The main purpose of 4K and bigger tiling is to distribute + * work among all memory channels evenly. Linear and 256B tiling generally don't do that. + * + * - Performance is also affected by how many memory channels a VMEM instruction or a clause + * intersects. Stores are more sensitive to this than loads because they are often globally + * coherent. For example, a 32-lane VMEM store can store to address range=128..640 (size=512), + * which stores data to 3 memory channels, while storing to address range=256..768 stores the same + * amount of data to only 2 memory channels. The latter case has better performance (less VMEM + * latency) when all memory channels are already busy because the wave only has to wait for replies + * from 2 channels instead of 3, and 1 channel has less work to do. Examples are: + * - Our clear_buffer and copy_buffer compute shaders where the store address of lane 0 is always + * a multiple of 256, so that each subgroup always stores to a 256B-aligned memory region of + * size 256*N. + * - Our image clear and blit compute shaders where the stored adress range of each compute + * subgroup is always aligned to 256B and stores 256*N. That's accomplished by making compute + * subgroups always clear or copy whole 256B image tiles, whose dimensions differ between tiling + * modes. + * - Vertex 0 of each TCS output starts on an address aligned to 256 to make TCS output stores + * from each subgroup always store 256B-aligned blocks of 256*N bytes. + * + * Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE and is stored in + * radeon_info::pipe_interleave_bytes. It's always 256 on all GCN and RDNA chips. "Pipe" means + * a memory channel in this context. + */ + struct amdgpu_gpu_info; struct drm_amdgpu_info_device; diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index cfb9ddb0d32..c1b0fb82c6d 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -908,7 +908,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims return CLAMP(workgroup_size, 1, 256); } -static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, +static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info, + uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches) { /* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when @@ -918,8 +919,9 @@ static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency. */ - unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, 256); - unsigned mem_one_perpatch_output = align(16 * num_patches, 256); + unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, + info->pipe_interleave_bytes); + unsigned mem_one_perpatch_output = align(16 * num_patches, info->pipe_interleave_bytes); return mem_one_pervertex_output * num_mem_tcs_outputs + mem_one_perpatch_output * num_mem_tcs_patch_outputs; @@ -955,20 +957,20 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu num_patches = MIN2(num_patches, 16); /* recommended */ /* Make sure the output data fits in the offchip buffer */ - unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, + unsigned mem_size = get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches); if (mem_size > info->hs_offchip_workgroup_dw_size * 4) { /* Find the number of patches that fit in memory. Each output is aligned separately, * so this division won't return a precise result. */ num_patches = info->hs_offchip_workgroup_dw_size * 4 / - get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, + get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, 1); - assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, + assert(get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches) <= info->hs_offchip_workgroup_dw_size * 4); - while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, + while (get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches + 1) <= info->hs_offchip_workgroup_dw_size * 4) num_patches++; @@ -1043,6 +1045,8 @@ ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_w /* Add 1 scratch item to make the number of items odd. This should improve * scratch performance by more randomly distributing scratch waves among * memory channels. + * + * On GFX11+, this is exactly "|= info->pipe_interleave_bytes". */ if (bytes_per_wave) bytes_per_wave |= info->scratch_wavesize_granularity; diff --git a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c index 1f41cf9d670..49a40adc5e4 100644 --- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c +++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c @@ -568,10 +568,15 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op * the beginning a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block * partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256. * Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n. + * + * pipe_interleave_bytes is 256. */ unsigned start_thread = - dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ? - DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0; + dst_offset_bound % options->info->pipe_interleave_bytes && + util_is_power_of_two_nonzero(dwords_per_thread) ? + DIV_ROUND_UP(options->info->pipe_interleave_bytes - + dst_offset_bound % options->info->pipe_interleave_bytes, + dwords_per_thread * 4) : 0; out->shader_key.has_start_thread = start_thread != 0; /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */