From 92133bb0ab1dc046046b26b17345a409fdfff162 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 29 Dec 2025 14:48:46 -0500
Subject: [PATCH] amd: demystify various optimizations we already have for
 memory channels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Explain why we do what we do, and use the radeon_info field properly.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39120>
---
 src/amd/common/ac_gpu_info.c                  |  2 +-
 src/amd/common/ac_gpu_info.h                  | 68 +++++++++++++++++++
 src/amd/common/ac_shader_util.c               | 18 +++--
 .../nir/ac_nir_meta_cs_clear_copy_buffer.c    |  9 ++-
 4 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 079408365f4..071db231393 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -2390,7 +2390,7 @@ void ac_get_task_info(const struct radeon_info *info,
     *    64K        | 550               | 574               | +4.3%
     *    # Adding 256 mitigates the performance loss from increasing num_entries.
     */
-   const uint32_t payload_entry_size = 16384 + 256;
+   const uint32_t payload_entry_size = 16384 + info->pipe_interleave_bytes;
    const uint16_t num_entries = get_task_num_entries(info->family);
    const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
    const uint32_t payload_ring_bytes = num_entries * payload_entry_size;
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 4330f9203d7..b288194d0c9 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -19,6 +19,74 @@ extern "C" {
 #define AMD_MAX_SA_PER_SE  2
 #define AMD_MAX_WGP        60
 
+/* Memory is divided among memory channels such that each 256B maps to a different memory channel
+ * and the memory channel index increments with each 256B block, which wraps around to 0 after
+ * the last memory channel index.
+ *
+ * For example, with 16 memory channels, address bits 8:11 contain the memory channel index.
+ * Let's call them "channel address bits". The number of memory channels can be a non-power-of-two
+ * on some chips.
+ *
+ * AMD GPUs usually assign 16 bits of memory bus to 1 memory channel. For example, 192-bit GDDR
+ * memory bus has 12 memory channels. APUs usually have 1 memory channel per 32 bits or 64 bits
+ * of memory bus. The physical memory channels don't always map 1:1 to AMD GPU memory channels.
+ *
+ * Memory channels are like separate cores. The advertised bandwidth and cache sizes are always
+ * for all memory channels combined. That means that each channel has only 1/num_memory_channels
+ * bandwidth and 1/memory_channels cache size. If all memory accesses unluckily end up in the same
+ * channel for all running shaders, the available memory bandwidth is only 1/num_memory_channels
+ * and the available cache size is also only 1/num_memory_channels. With 16 memory channels, that
+ * would be 16x worse cache and memory performance.
+ *
+ * Strategies to distribute work among all memory channels evenly:
+ *
+ * - Ring element sizes should be set to an odd multiple of 256 to make sure each element starts on
+ *   a different memory channel. This is similar to how LDS banks work, but the granularity is 256B
+ *   instead of 4B here. The simplest way to do that is that if the ring element size is > 256,
+ *   apply "|= 256;" to it. The scratch ring and the task shader payload ring do this.
+ *
+ * - For tree data structures in memory, try to randomize channel address bits, which can be done by
+ *   making sure that tree nodes start on an odd multiple of 256. All possible numbers of
+ *   ((address / 256) % num_memory_channels) should be represented equally in the node addresses.
+ *
+ * - If we have a ring buffer where we can't set the ring element size (e.g. TCS outputs where it's
+ *   set to 32K), each workgroup should write at least (num_memory_channels * 256) of TCS outputs
+ *   in bytes, and ideally twice that amount, to make sure each workgroup doesn't leave some memory
+ *   channels (and thus bandwidth) completely unused or underutilized. We could also shift
+ *   the placement of TCS outputs to a random 256*i offset within each 32K segment instead. Our TCS
+ *   workgroup size calculation takes this into account.
+ *
+ * - radeon_surf::tile_swizzle is a random number that randomizes channel address bits to make sure
+ *   some fixed image coordinates (x,y) map to a different memory channel for each image, so if
+ *   a shader were to access multiple images at some fixed image coordinates (x,y) with the same
+ *   bpp, each image would load from a different channel if radeon_surf::tile_swizzle is different.
+ *   If multiple render targets are bound, it's recommended that they all have different tile_swizzle,
+ *   so that MRT0 goes to channel A, MRT1 goes to channel B (A != B), etc. Other than that, image
+ *   tiling does the optimal thing for us. The main purpose of 4K and bigger tiling is to distribute
+ *   work among all memory channels evenly. Linear and 256B tiling generally don't do that.
+ *
+ * - Performance is also affected by how many memory channels a VMEM instruction or a clause
+ *   intersects. Stores are more sensitive to this than loads because they are often globally
+ *   coherent. For example, a 32-lane VMEM store can store to address range=128..640 (size=512),
+ *   which stores data to 3 memory channels, while storing to address range=256..768 stores the same
+ *   amount of data to only 2 memory channels. The latter case has better performance (less VMEM
+ *   latency) when all memory channels are already busy because the wave only has to wait for replies
+ *   from 2 channels instead of 3, and 1 channel has less work to do. Examples are:
+ *   - Our clear_buffer and copy_buffer compute shaders where the store address of lane 0 is always
+ *     a multiple of 256, so that each subgroup always stores to a 256B-aligned memory region of
+ *     size 256*N.
+ *   - Our image clear and blit compute shaders where the stored adress range of each compute
+ *     subgroup is always aligned to 256B and stores 256*N. That's accomplished by making compute
+ *     subgroups always clear or copy whole 256B image tiles, whose dimensions differ between tiling
+ *     modes.
+ *   - Vertex 0 of each TCS output starts on an address aligned to 256 to make TCS output stores
+ *     from each subgroup always store 256B-aligned blocks of 256*N bytes.
+ *
+ * Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE and is stored in
+ * radeon_info::pipe_interleave_bytes. It's always 256 on all GCN and RDNA chips. "Pipe" means
+ * a memory channel in this context.
+ */
+
 struct amdgpu_gpu_info;
 struct drm_amdgpu_info_device;
 
diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index cfb9ddb0d32..c1b0fb82c6d 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -908,7 +908,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
    return CLAMP(workgroup_size, 1, 256);
 }
 
-static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
+static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info,
+                                           uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
                                            uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
 {
    /* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
@@ -918,8 +919,9 @@ static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t
     * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
     * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
     */
-   unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, 256);
-   unsigned mem_one_perpatch_output = align(16 * num_patches, 256);
+   unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches,
+                                             info->pipe_interleave_bytes);
+   unsigned mem_one_perpatch_output = align(16 * num_patches, info->pipe_interleave_bytes);
 
    return mem_one_pervertex_output * num_mem_tcs_outputs +
           mem_one_perpatch_output * num_mem_tcs_patch_outputs;
@@ -955,20 +957,20 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
       num_patches = MIN2(num_patches, 16); /* recommended */
 
    /* Make sure the output data fits in the offchip buffer */
-   unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+   unsigned mem_size = get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
                                                   num_mem_tcs_patch_outputs, num_patches);
    if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
       /* Find the number of patches that fit in memory. Each output is aligned separately,
        * so this division won't return a precise result.
        */
       num_patches = info->hs_offchip_workgroup_dw_size * 4 /
-                    get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+                    get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
                                                num_mem_tcs_patch_outputs, 1);
-      assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+      assert(get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
                                         num_mem_tcs_patch_outputs, num_patches) <=
              info->hs_offchip_workgroup_dw_size * 4);
 
-      while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
+      while (get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
                                         num_mem_tcs_patch_outputs, num_patches + 1) <=
              info->hs_offchip_workgroup_dw_size * 4)
          num_patches++;
@@ -1043,6 +1045,8 @@ ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_w
    /* Add 1 scratch item to make the number of items odd. This should improve
     * scratch performance by more randomly distributing scratch waves among
     * memory channels.
+    *
+    * On GFX11+, this is exactly "|= info->pipe_interleave_bytes".
     */
    if (bytes_per_wave)
       bytes_per_wave |= info->scratch_wavesize_granularity;
diff --git a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
index 1f41cf9d670..49a40adc5e4 100644
--- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
+++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c
@@ -568,10 +568,15 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
     * the beginning a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
     * partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
     * Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
+    *
+    * pipe_interleave_bytes is 256.
     */
    unsigned start_thread =
-      dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
-            DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
+      dst_offset_bound % options->info->pipe_interleave_bytes &&
+      util_is_power_of_two_nonzero(dwords_per_thread) ?
+            DIV_ROUND_UP(options->info->pipe_interleave_bytes -
+                         dst_offset_bound % options->info->pipe_interleave_bytes,
+                         dwords_per_thread * 4) : 0;
    out->shader_key.has_start_thread = start_thread != 0;
 
    /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */