ac/gpu_info: add #define AMD_MEMCHANNEL_INTERLEAVE_BYTES

radeon_info::pipe_interleave_bytes is renamed to r600_pipe_interleave_bytes
where it can be 512 on some chips.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39120>
This commit is contained in:
Marek Olšák 2026-01-05 16:07:47 -05:00 committed by Marge Bot
parent 92133bb0ab
commit 13cfd0176c
10 changed files with 32 additions and 32 deletions

View file

@ -946,11 +946,13 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->gb_addr_config = 0;
info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(info->gb_addr_config);
info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config);
assert((256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)) ==
AMD_MEMCHANNEL_INTERLEAVE_BYTES);
} else {
unsigned pipe_config = G_009910_PIPE_CONFIG(amdinfo.gb_tile_mode[CIK_TILE_MODE_COLOR_2D]);
info->num_tile_pipes = ac_pipe_config_to_num_pipes(pipe_config);
info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config);
assert((256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config)) ==
AMD_MEMCHANNEL_INTERLEAVE_BYTES);
}
info->r600_has_virtual_memory = true;
@ -1992,7 +1994,6 @@ void ac_print_gpu_info(FILE *f, const struct radeon_info *info, int fd)
fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
fprintf(f, " max_render_backends = %i\n", info->max_render_backends);
fprintf(f, " num_tile_pipes = %i\n", info->num_tile_pipes);
fprintf(f, " pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
fprintf(f, " enabled_rb_mask = 0x%" PRIx64 "\n", info->enabled_rb_mask);
fprintf(f, " max_alignment = %u\n", (unsigned)info->max_alignment);
fprintf(f, " pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);
@ -2390,7 +2391,7 @@ void ac_get_task_info(const struct radeon_info *info,
* 64K | 550 | 574 | +4.3%
* # Adding 256 mitigates the performance loss from increasing num_entries.
*/
const uint32_t payload_entry_size = 16384 + info->pipe_interleave_bytes;
const uint32_t payload_entry_size = 16384 + AMD_MEMCHANNEL_INTERLEAVE_BYTES;
const uint16_t num_entries = get_task_num_entries(info->family);
const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
const uint32_t payload_ring_bytes = num_entries * payload_entry_size;

View file

@ -82,10 +82,10 @@ extern "C" {
* - Vertex 0 of each TCS output starts on an address aligned to 256 to make TCS output stores
* from each subgroup always store 256B-aligned blocks of 256*N bytes.
*
* Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE and is stored in
* radeon_info::pipe_interleave_bytes. It's always 256 on all GCN and RDNA chips. "Pipe" means
* a memory channel in this context.
* Number 256 comes from GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE. It's always 256 on all GCN and RDNA
* chips. "Pipe" means a memory channel in this context.
*/
#define AMD_MEMCHANNEL_INTERLEAVE_BYTES 256 /* always equal to GB_ADDR_CONFIG.PIPE_INTERLEAVE_SIZE */
struct amdgpu_gpu_info;
struct drm_amdgpu_info_device;
@ -417,12 +417,12 @@ struct radeon_info {
uint32_t r600_gb_backend_map; /* R600 harvest config */
bool r600_gb_backend_map_valid;
uint32_t r600_num_banks;
uint32_t r600_pipe_interleave_bytes;
uint32_t mc_arb_ramcfg;
uint32_t gb_addr_config;
uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
uint32_t max_render_backends; /* number of render backends incl. disabled ones */
uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
uint32_t pipe_interleave_bytes;
uint64_t enabled_rb_mask; /* bitmask of enabled physical RBs, up to max_render_backends bits */
uint64_t max_alignment; /* from addrlib */
uint32_t pbb_max_alloc_count;

View file

@ -908,8 +908,7 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
return CLAMP(workgroup_size, 1, 256);
}
static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info,
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
{
/* Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
@ -920,8 +919,8 @@ static unsigned get_tcs_wg_output_mem_size(const struct radeon_info *info,
* cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
*/
unsigned mem_one_pervertex_output = align(16 * num_tcs_output_cp * num_patches,
info->pipe_interleave_bytes);
unsigned mem_one_perpatch_output = align(16 * num_patches, info->pipe_interleave_bytes);
AMD_MEMCHANNEL_INTERLEAVE_BYTES);
unsigned mem_one_perpatch_output = align(16 * num_patches, AMD_MEMCHANNEL_INTERLEAVE_BYTES);
return mem_one_pervertex_output * num_mem_tcs_outputs +
mem_one_perpatch_output * num_mem_tcs_patch_outputs;
@ -957,20 +956,20 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
num_patches = MIN2(num_patches, 16); /* recommended */
/* Make sure the output data fits in the offchip buffer */
unsigned mem_size = get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches);
if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
/* Find the number of patches that fit in memory. Each output is aligned separately,
* so this division won't return a precise result.
*/
num_patches = info->hs_offchip_workgroup_dw_size * 4 /
get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, 1);
assert(get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches) <=
info->hs_offchip_workgroup_dw_size * 4);
while (get_tcs_wg_output_mem_size(info, num_tcs_output_cp, num_mem_tcs_outputs,
while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches + 1) <=
info->hs_offchip_workgroup_dw_size * 4)
num_patches++;
@ -1046,7 +1045,7 @@ ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t bytes_per_w
* scratch performance by more randomly distributing scratch waves among
* memory channels.
*
* On GFX11+, this is exactly "|= info->pipe_interleave_bytes".
* On GFX11+, this is exactly "|= AMD_MEMCHANNEL_INTERLEAVE_BYTES".
*/
if (bytes_per_wave)
bytes_per_wave |= info->scratch_wavesize_granularity;

View file

@ -1276,7 +1276,7 @@ static int gfx6_surface_settings(struct ac_addrlib *addrlib, const struct radeon
static void ac_compute_cmask(const struct radeon_info *info, const struct ac_surf_config *config,
struct radeon_surf *surf)
{
unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
unsigned pipe_interleave_bytes = AMD_MEMCHANNEL_INTERLEAVE_BYTES;
unsigned num_pipes = info->num_tile_pipes;
unsigned cl_width, cl_height;

View file

@ -568,14 +568,12 @@ ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *op
* the beginning a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B block
* partially for each wave is inefficient, which happens when dst_offset isn't aligned to 256.
* Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread isn't 2^n.
*
* pipe_interleave_bytes is 256.
*/
unsigned start_thread =
dst_offset_bound % options->info->pipe_interleave_bytes &&
dst_offset_bound % AMD_MEMCHANNEL_INTERLEAVE_BYTES &&
util_is_power_of_two_nonzero(dwords_per_thread) ?
DIV_ROUND_UP(options->info->pipe_interleave_bytes -
dst_offset_bound % options->info->pipe_interleave_bytes,
DIV_ROUND_UP(AMD_MEMCHANNEL_INTERLEAVE_BYTES -
dst_offset_bound % AMD_MEMCHANNEL_INTERLEAVE_BYTES,
dwords_per_thread * 4) : 0;
out->shader_key.has_start_thread = start_thread != 0;

View file

@ -1076,7 +1076,7 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx,
const struct util_format_description *desc;
unsigned block_size = util_format_get_blocksize(res->b.b.format);
unsigned pitch_alignment =
MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size);
MAX2(64, rctx->screen->b.info.r600_pipe_interleave_bytes / block_size);
unsigned pitch = align(res->b.b.width0, pitch_alignment);
int i;
unsigned width_elements;

View file

@ -1078,7 +1078,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
printf("num_render_backends = %i\n", rscreen->info.max_render_backends);
printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
printf("pipe_interleave_bytes = %i\n", rscreen->info.r600_pipe_interleave_bytes);
printf("enabled_rb_mask = 0x%" PRIx64 "\n", rscreen->info.enabled_rb_mask);
printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment);
}

View file

@ -723,7 +723,7 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned element_bits = 4;
unsigned cmask_cache_bits = 1024;
unsigned num_pipes = rscreen->info.num_tile_pipes;
unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
unsigned pipe_interleave_bytes = rscreen->info.r600_pipe_interleave_bytes;
unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes;
unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements;
@ -843,7 +843,7 @@ static void r600_texture_get_htile_size(struct r600_common_screen *rscreen,
slice_elements = (width * height) / (8 * 8);
slice_bytes = slice_elements * 4;
pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
pipe_interleave_bytes = rscreen->info.r600_pipe_interleave_bytes;
base_align = num_pipes * pipe_interleave_bytes;
rtex->surface.meta_alignment_log2 = util_logbase2(base_align);

View file

@ -204,7 +204,8 @@ static void si_compute_cmask(const struct radeon_info *info,
const struct ac_surf_config *config,
struct radeon_surf *surf)
{
unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
unsigned pipe_interleave_bytes = info->gfx_level >= GFX6 ? AMD_MEMCHANNEL_INTERLEAVE_BYTES :
info->r600_pipe_interleave_bytes;
unsigned num_pipes = info->num_tile_pipes;
unsigned cl_width, cl_height;
@ -315,7 +316,8 @@ static void si_compute_htile(const struct radeon_info *info,
slice_elements = (width * height) / (8 * 8);
slice_bytes = slice_elements * 4;
pipe_interleave_bytes = info->pipe_interleave_bytes;
pipe_interleave_bytes = info->gfx_level >= GFX6 ? AMD_MEMCHANNEL_INTERLEAVE_BYTES :
info->r600_pipe_interleave_bytes;
base_align = num_pipes * pipe_interleave_bytes;
surf->meta_alignment_log2 = util_logbase2(base_align);

View file

@ -465,13 +465,13 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
4 << ((tiling_config & 0xf0) >> 4) :
4 << ((tiling_config & 0x30) >> 4);
ws->info.pipe_interleave_bytes =
ws->info.r600_pipe_interleave_bytes =
ws->info.gfx_level >= EVERGREEN ?
256 << ((tiling_config & 0xf00) >> 8) :
256 << ((tiling_config & 0xc0) >> 6);
if (!ws->info.pipe_interleave_bytes)
ws->info.pipe_interleave_bytes =
if (!ws->info.r600_pipe_interleave_bytes)
ws->info.r600_pipe_interleave_bytes =
ws->info.gfx_level >= EVERGREEN ? 512 : 256;
radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_TILE_PIPES, NULL,