mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-17 17:28:19 +02:00
amd: demystify various optimizations we already have for memory channels
Explain why we do what we do, and use the radeon_info field properly. Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Acked-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39120>
This commit is contained in:
parent
75166dff1d
commit
92133bb0ab
4 changed files with 87 additions and 10 deletions
|
|
@ -2390,7 +2390,7 @@ void ac_get_task_info(const struct radeon_info *info,
|
|||
* 64K | 550 | 574 | +4.3%
|
||||
* # Adding 256 mitigates the performance loss from increasing num_entries.
|
||||
*/
|
||||
const uint32_t payload_entry_size = 16384 + 256;
|
||||
const uint32_t payload_entry_size = 16384 + info->pipe_interleave_bytes;
|
||||
const uint16_t num_entries = get_task_num_entries(info->family);
|
||||
const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
|
||||
const uint32_t payload_ring_bytes = num_entries * payload_entry_size;
|
||||
|
|
|
|||
|
|
@ -19,6 +19,74 @@ extern "C" {
|
|||
#define AMD_MAX_SA_PER_SE 2
|
||||
#define AMD_MAX_WGP 60
|
||||
|
||||
/* Memory is divided among memory channels such that each 256B maps to a different memory channel
|
||||
* and the memory channel index increments with each 256B block, which wraps around to 0 after
|
||||
* the last memory channel index.
|
||||
*
|
||||
* For example, with 16 memory channels, address bits 8:11 contain the memory channel index.
|
||||
* Let's call them "channel address bits". The number of memory channels can be a non-power-of-two
|
||||
* on some chips.
|
||||
*
|
||||
* AMD GPUs usually assign 16 bits of memory bus to 1 memory channel. For example, 192-bit GDDR
|
||||
* memory bus has 12 memory channels. APUs usually have 1 memory channel per 32 bits or 64 bits
|
||||
* of memory bus. The physical memory channels don't always map 1:1 to AMD GPU memory channels.
|
||||
*
|
||||
* Memory channels are like separate cores. The advertised bandwidth and cache sizes are always
|
||||
* for all memory channels combined. That means that each channel has only 1/num_memory_channels
|
||||
* bandwidth and 1/memory_channels cache size. If all memory accesses unluckily end up in the same
|
||||
* channel for all running shaders, the available memory bandwidth is only 1/num_memory_channels
|
||||
* and the available cache size is also only 1/num_memory_channels. With 16 memory channels, that
|
||||
* would be 16x worse cache and memory performance.
|
||||
*
|
||||
* Strategies to distribute work among all memory channels evenly:
|
||||
*
|
||||
* - Ring element sizes should be set to an odd multiple of 256 to make sure each element starts on
|
||||
* a different memory channel. This is similar to how LDS banks work, but the granularity is 256B
|
||||
* instead of 4B here. The simplest way to do that is that if the ring element size is > 256,
|
||||
* apply "|= 256;" to it. The scratch ring and the task shader payload ring do this.
|
||||
*
|
||||
* - For tree data structures in memory, try to randomize channel address bits, which can be done by
|
||||
* making sure that tree nodes start on an odd multiple of 256. All possible numbers of
|
||||
* ((address / 256) % num_memory_channels) should be represented equally in the node addresses.
|
||||
*
|
||||
* - If we have a ring buffer where we can't set the ring element size (e.g. TCS outputs where it's
|
||||
* set to 32K), each workgroup should write at least (num_memory_channels * 256) of TCS outputs
|
||||
* in bytes, and ideally twice that amount, to make sure each workgroup doesn't leave some memory
|
||||
* channels (and thus bandwidth) completely unused or underutilized. We could also shift
|
||||
* the placement of TCS outputs to a random 256*i offset within each 32K segment instead. Our TCS
|
||||
* workgroup size calculation takes this into account.
|
||||
*
|
||||
* - radeon_surf::tile_swizzle is a random number that randomizes channel address bits to make sure
|
||||
* some fixed image coordinates (x,y) map to a different memory channel for each image, so if
|
||||
* a shader were to access multiple images at some fixed image coordinates (x,y) with the same
|
||||
* bpp, each image would load from a different channel if radeon_surf::tile_swizzle is different.
|
||||
* If multiple render targets are bound, it's recommended that they all have different tile_swizzle,
|
||||
* so that MRT0 goes to channel A, MRT1 goes to channel B (A != B), etc. Other than that, image
|
||||
* tiling does the optimal thing for us. The main purpose of 4K and bigger tiling is to distribute
|
||||
* work among all memory channels evenly. Linear and 256B tiling generally don't do that.
|
||||
*
|
||||
* - Performance is also affected by how many memory channels a VMEM instruction or a clause
|
||||
* intersects. Stores are more sensitive to this than loads because they are often globally
|
||||
* coherent. For example, a 32-lane VMEM store can store to address range=128..640 (size=512),
|
||||
* which stores data to 3 memory channels, while storing to address range=256..768 stores the same
|
||||
* amount of data to only 2 memory channels. The latter case has better performance (less VMEM
|
||||
* latency) when all memory channels are already busy because the wave only has to wait for replies
|
||||
* from 2 channels instead of 3, and 1 channel has less work to do. Examples are:
|
||||
* - Our clear_buffer and copy_buffer compute shaders where the store address of lane 0 is always
|
||||
* a multiple of 256, so that each subgroup always stores to a 256B-aligned memory region of
|
||||
* size 256*N.
|
||||
* - Our image clear and blit compute shaders where the stored adress range of each compute
|
||||
* subgroup is always aligned to 256B and stores 256*N. That's accomplished by making compute
|
||||
* subgroups always clear or copy whole 256B image tiles, whose dimensions differ between tiling
|
||||
* modes.
|
||||
* - Vertex 0 of each TCS output starts on an address aligned to 256 to make TCS output stores
|
||||
* from each subgroup always store 256B-aligned blocks of 256*N bytes.
|
||||
*
|
||||
* Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE and is stored in
|
||||
* radeon_info::pipe_interleave_bytes. It's always 256 on all GCN and RDNA chips. "Pipe" means
|
||||
* a memory channel in this context.
|
||||
*/
|
||||
|
||||
struct amdgpu_gpu_info;
|
||||
struct drm_amdgpu_info_device;
|
||||
|
||||
|
|
|
|||
|
|
@ -908,7 +908,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
|
|||
return CLAMP(workgroup_size, 1, 256);
|
||||
}
|
||||
|
||||
static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
|
||||
static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info,
|
||||
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
|
||||
uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
|
||||
{
|
||||
/* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
|
||||
|
|
@ -918,8 +919,9 @@ static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t
|
|||
* in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
|
||||
* cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
|
||||
*/
|
||||
unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, 256);
|
||||
unsigned mem_one_perpatch_output = align(16 * num_patches, 256);
|
||||
unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches,
|
||||
info->pipe_interleave_bytes);
|
||||
unsigned mem_one_perpatch_output = align(16 * num_patches, info->pipe_interleave_bytes);
|
||||
|
||||
return mem_one_pervertex_output * num_mem_tcs_outputs +
|
||||
mem_one_perpatch_output * num_mem_tcs_patch_outputs;
|
||||
|
|
@ -955,20 +957,20 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
|
|||
num_patches = MIN2(num_patches, 16); /* recommended */
|
||||
|
||||
/* Make sure the output data fits in the offchip buffer */
|
||||
unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
unsigned mem_size = get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
num_mem_tcs_patch_outputs, num_patches);
|
||||
if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
|
||||
/* Find the number of patches that fit in memory. Each output is aligned separately,
|
||||
* so this division won't return a precise result.
|
||||
*/
|
||||
num_patches = info->hs_offchip_workgroup_dw_size * 4 /
|
||||
get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
num_mem_tcs_patch_outputs, 1);
|
||||
assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
assert(get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
num_mem_tcs_patch_outputs, num_patches) <=
|
||||
info->hs_offchip_workgroup_dw_size * 4);
|
||||
|
||||
while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
while (get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
|
||||
num_mem_tcs_patch_outputs, num_patches + 1) <=
|
||||
info->hs_offchip_workgroup_dw_size * 4)
|
||||
num_patches++;
|
||||
|
|
@ -1043,6 +1045,8 @@ ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_w
|
|||
/* Add 1 scratch item to make the number of items odd. This should improve
|
||||
* scratch performance by more randomly distributing scratch waves among
|
||||
* memory channels.
|
||||
*
|
||||
* On GFX11+, this is exactly "|= info->pipe_interleave_bytes".
|
||||
*/
|
||||
if (bytes_per_wave)
|
||||
bytes_per_wave |= info->scratch_wavesize_granularity;
|
||||
|
|
|
|||
|
|
@ -568,10 +568,15 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
|
|||
* the beginning a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
|
||||
* partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
|
||||
* Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
|
||||
*
|
||||
* pipe_interleave_bytes is 256.
|
||||
*/
|
||||
unsigned start_thread =
|
||||
dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
|
||||
DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
|
||||
dst_offset_bound % options->info->pipe_interleave_bytes &&
|
||||
util_is_power_of_two_nonzero(dwords_per_thread) ?
|
||||
DIV_ROUND_UP(options->info->pipe_interleave_bytes -
|
||||
dst_offset_bound % options->info->pipe_interleave_bytes,
|
||||
dwords_per_thread * 4) : 0;
|
||||
out->shader_key.has_start_thread = start_thread != 0;
|
||||
|
||||
/* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue