From 13cfd0176c7a6368934f54f6372c9cbd4a0753c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Jan 2026 16:07:47 -0500 Subject: [PATCH] ac/gpu_info: add #define AMD_MEMCHANNEL_INTERLEAVE_BYTES radeon_info::pipe_interleave_bytes is renamed to r600_pipe_interleave_bytes where it can be 512 on some chips. Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_gpu_info.c | 9 +++++---- src/amd/common/ac_gpu_info.h | 8 ++++---- src/amd/common/ac_shader_util.c | 17 ++++++++--------- src/amd/common/ac_surface.c | 2 +- .../nir/ac_nir_meta_cs_clear_copy_buffer.c | 8 +++----- src/gallium/drivers/r600/evergreen_state.c | 2 +- src/gallium/drivers/r600/r600_pipe_common.c | 2 +- src/gallium/drivers/r600/r600_texture.c | 4 ++-- .../winsys/radeon/drm/radeon_drm_surface.c | 6 ++++-- .../winsys/radeon/drm/radeon_drm_winsys.c | 6 +++--- 10 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 071db231393..a70fd18dc04 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -946,11 +946,13 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->gb_addr_config = 0; info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(info->gb_addr_config); - info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config); + assert((256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)) == + AMD_MEMCHANNEL_INTERLEAVE_BYTES); } else { unsigned pipe_config = G_009910_PIPE_CONFIG(amdinfo.gb_tile_mode[CIK_TILE_MODE_COLOR_2D]); info->num_tile_pipes = ac_pipe_config_to_num_pipes(pipe_config); - info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config); + assert((256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config)) == + AMD_MEMCHANNEL_INTERLEAVE_BYTES); } info->r600_has_virtual_memory = true; @@ -1992,7 +1994,6 @@ void ac_print_gpu_info(FILE *f, const struct radeon_info *info, int fd) fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); fprintf(f, " max_render_backends = %i\n", info->max_render_backends); fprintf(f, " num_tile_pipes = %i\n", info->num_tile_pipes); - fprintf(f, " pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes); fprintf(f, " enabled_rb_mask = 0x%" PRIx64 "\n", info->enabled_rb_mask); fprintf(f, " max_alignment = %u\n", (unsigned)info->max_alignment); fprintf(f, " pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count); @@ -2390,7 +2391,7 @@ void ac_get_task_info(const struct radeon_info *info, * 64K | 550 | 574 | +4.3% * # Adding 256 mitigates the performance loss from increasing num_entries. */ - const uint32_t payload_entry_size = 16384 + info->pipe_interleave_bytes; + const uint32_t payload_entry_size = 16384 + AMD_MEMCHANNEL_INTERLEAVE_BYTES; const uint16_t num_entries = get_task_num_entries(info->family); const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES; const uint32_t payload_ring_bytes = num_entries * payload_entry_size; diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index b288194d0c9..ff8231facb5 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -82,10 +82,10 @@ extern "C" { * - Vertex 0 of each TCS output starts on an address aligned to 256 to make TCS output stores * from each subgroup always store 256B-aligned blocks of 256*N bytes. * - * Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE and is stored in - * radeon_info::pipe_interleave_bytes. It's always 256 on all GCN and RDNA chips. "Pipe" means - * a memory channel in this context. + * Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE. It's always 256 on all GCN and RDNA + * chips. "Pipe" means a memory channel in this context. */ +#define AMD_MEMCHANNEL_INTERLEAVE_BYTES 256 /* always equal to GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE */ struct amdgpu_gpu_info; struct drm_amdgpu_info_device; @@ -417,12 +417,12 @@ struct radeon_info { uint32_t r600_gb_backend_map; /* R600 harvest config */ bool r600_gb_backend_map_valid; uint32_t r600_num_banks; + uint32_t r600_pipe_interleave_bytes; uint32_t mc_arb_ramcfg; uint32_t gb_addr_config; uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */ uint32_t max_render_backends; /* number of render backends incl. disabled ones */ uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ - uint32_t pipe_interleave_bytes; uint64_t enabled_rb_mask; /* bitmask of enabled physical RBs, up to max_render_backends bits */ uint64_t max_alignment; /* from addrlib */ uint32_t pbb_max_alloc_count; diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index c1b0fb82c6d..7ac97337f82 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -908,8 +908,7 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims return CLAMP(workgroup_size, 1, 256); } -static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info, - uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, +static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches) { /* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when @@ -920,8 +919,8 @@ static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info, * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency. */ unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches, - info->pipe_interleave_bytes); - unsigned mem_one_perpatch_output = align(16 * num_patches, info->pipe_interleave_bytes); + AMD_MEMCHANNEL_INTERLEAVE_BYTES); + unsigned mem_one_perpatch_output = align(16 * num_patches, AMD_MEMCHANNEL_INTERLEAVE_BYTES); return mem_one_pervertex_output * num_mem_tcs_outputs + mem_one_perpatch_output * num_mem_tcs_patch_outputs; @@ -957,20 +956,20 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu num_patches = MIN2(num_patches, 16); /* recommended */ /* Make sure the output data fits in the offchip buffer */ - unsigned mem_size = get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, + unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches); if (mem_size > info->hs_offchip_workgroup_dw_size * 4) { /* Find the number of patches that fit in memory. Each output is aligned separately, * so this division won't return a precise result. */ num_patches = info->hs_offchip_workgroup_dw_size * 4 / - get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, + get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, 1); - assert(get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, + assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches) <= info->hs_offchip_workgroup_dw_size * 4); - while (get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs, + while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, num_patches + 1) <= info->hs_offchip_workgroup_dw_size * 4) num_patches++; @@ -1046,7 +1045,7 @@ ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_w * scratch performance by more randomly distributing scratch waves among * memory channels. * - * On GFX11+, this is exactly "|= info->pipe_interleave_bytes". + * On GFX11+, this is exactly "|= AMD_MEMCHANNEL_INTERLEAVE_BYTES". */ if (bytes_per_wave) bytes_per_wave |= info->scratch_wavesize_granularity; diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 130157cc9dd..1f8dda9ab2d 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -1276,7 +1276,7 @@ static int gfx6_surface_settings(struct ac_addrlib *addrlib, const struct radeon static void ac_compute_cmask(const struct radeon_info *info, const struct ac_surf_config *config, struct radeon_surf *surf) { - unsigned pipe_interleave_bytes = info->pipe_interleave_bytes; + unsigned pipe_interleave_bytes = AMD_MEMCHANNEL_INTERLEAVE_BYTES; unsigned num_pipes = info->num_tile_pipes; unsigned cl_width, cl_height; diff --git a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c index 49a40adc5e4..4d4410311ea 100644 --- a/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c +++ b/src/amd/common/nir/ac_nir_meta_cs_clear_copy_buffer.c @@ -568,14 +568,12 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op * the beginning a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block * partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256. * Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n. - * - * pipe_interleave_bytes is 256. */ unsigned start_thread = - dst_offset_bound % options->info->pipe_interleave_bytes && + dst_offset_bound % AMD_MEMCHANNEL_INTERLEAVE_BYTES && util_is_power_of_two_nonzero(dwords_per_thread) ? - DIV_ROUND_UP(options->info->pipe_interleave_bytes - - dst_offset_bound % options->info->pipe_interleave_bytes, + DIV_ROUND_UP(AMD_MEMCHANNEL_INTERLEAVE_BYTES - + dst_offset_bound % AMD_MEMCHANNEL_INTERLEAVE_BYTES, dwords_per_thread * 4) : 0; out->shader_key.has_start_thread = start_thread != 0; diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 229b1980b6e..75f7357b10a 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1076,7 +1076,7 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx, const struct util_format_description *desc; unsigned block_size = util_format_get_blocksize(res->b.b.format); unsigned pitch_alignment = - MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size); + MAX2(64, rctx->screen->b.info.r600_pipe_interleave_bytes / block_size); unsigned pitch = align(res->b.b.width0, pitch_alignment); int i; unsigned width_elements; diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c index 647d20cccdd..69d0995431d 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.c +++ b/src/gallium/drivers/r600/r600_pipe_common.c @@ -1078,7 +1078,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks); printf("num_render_backends = %i\n", rscreen->info.max_render_backends); printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes); - printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes); + printf("pipe_interleave_bytes = %i\n", rscreen->info.r600_pipe_interleave_bytes); printf("enabled_rb_mask = 0x%" PRIx64 "\n", rscreen->info.enabled_rb_mask); printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment); } diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c index e94af3e5a92..c27bbbc00b6 100644 --- a/src/gallium/drivers/r600/r600_texture.c +++ b/src/gallium/drivers/r600/r600_texture.c @@ -723,7 +723,7 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned element_bits = 4; unsigned cmask_cache_bits = 1024; unsigned num_pipes = rscreen->info.num_tile_pipes; - unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + unsigned pipe_interleave_bytes = rscreen->info.r600_pipe_interleave_bytes; unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes; unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements; @@ -843,7 +843,7 @@ static void r600_texture_get_htile_size(struct r600_common_screen *rscreen, slice_elements = (width * height) / (8 * 8); slice_bytes = slice_elements * 4; - pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + pipe_interleave_bytes = rscreen->info.r600_pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; rtex->surface.meta_alignment_log2 = util_logbase2(base_align); diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_surface.c b/src/gallium/winsys/radeon/drm/radeon_drm_surface.c index 1421cc386e5..5b7d3249e21 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_surface.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_surface.c @@ -204,7 +204,8 @@ static void si_compute_cmask(const struct radeon_info *info, const struct ac_surf_config *config, struct radeon_surf *surf) { - unsigned pipe_interleave_bytes = info->pipe_interleave_bytes; + unsigned pipe_interleave_bytes = info->gfx_level >= GFX6 ? AMD_MEMCHANNEL_INTERLEAVE_BYTES : + info->r600_pipe_interleave_bytes; unsigned num_pipes = info->num_tile_pipes; unsigned cl_width, cl_height; @@ -315,7 +316,8 @@ static void si_compute_htile(const struct radeon_info *info, slice_elements = (width * height) / (8 * 8); slice_bytes = slice_elements * 4; - pipe_interleave_bytes = info->pipe_interleave_bytes; + pipe_interleave_bytes = info->gfx_level >= GFX6 ? AMD_MEMCHANNEL_INTERLEAVE_BYTES : + info->r600_pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; surf->meta_alignment_log2 = util_logbase2(base_align); diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index 242869fd94a..bf951fc46b7 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -465,13 +465,13 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) 4 << ((tiling_config & 0xf0) >> 4) : 4 << ((tiling_config & 0x30) >> 4); - ws->info.pipe_interleave_bytes = + ws->info.r600_pipe_interleave_bytes = ws->info.gfx_level >= EVERGREEN ? 256 << ((tiling_config & 0xf00) >> 8) : 256 << ((tiling_config & 0xc0) >> 6); - if (!ws->info.pipe_interleave_bytes) - ws->info.pipe_interleave_bytes = + if (!ws->info.r600_pipe_interleave_bytes) + ws->info.r600_pipe_interleave_bytes = ws->info.gfx_level >= EVERGREEN ? 512 : 256; radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_TILE_PIPES, NULL,