diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 191d71678cc..9c47659282e 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -2501,8 +2501,38 @@ void ac_get_task_info(const struct radeon_info *info, { /* Size of each payload entry in the task payload ring. * Spec requires minimum 16K bytes. + * + * Add 256B to make consecutive payloads start on different memory channels to increase memory + * performance. (each 256B region maps to a different memory channel) + * + * Navi48 improvement from adding 256B to the payload entry size: + * (using https://github.com/zeux/niagara/discussions/41 at commit 745700c) + * + * With cluster culling (press K): + * |FPS for 16K |FPS for 16K+256 | + * num_entries|(payload ring size)|(payload ring size)|diff for +256 + * -----------|-------------------|-------------------|------------- + * 1K | 582 (16 MB) | 582 (17 MB) | +0% + * 2K | 587 (32 MB) | 591 (33 MB) | +0.7% + * 4K | 608 (64 MB) | 611 (65 MB) | +0.5% + * 8K | 653 (128 MB) | 660 (130 MB) | +1.1% + * 16K | 765 (256 MB) | 789 (260 MB) | +3.1% + * 32K | 880 (512 MB) | 984 (520 MB) | +11.8% + * 64K | 874 (1024 MB) | 970 (1040 MB) | +11% + * + * Without cluster culling (don't press K): + * num_entries|FPS for 16K |FPS for 16K+256 |diff for +256 + * -----------|-------------------|-------------------|------------- + * 1K | 578 | 578 | +0% + * 2K | 578 | 578 | +0% + * 4K | 574 | 578 | +0.7% + * 8K | 573 | 578 | +0.9% + * 16K | 565 | 579 | +2.4% + * 32K | 550 | 574 | +4.3% + * 64K | 550 | 574 | +4.3% + * # Adding 256 mitigates the performance loss from increasing num_entries. */ - const uint32_t payload_entry_size = 16384; + const uint32_t payload_entry_size = 16384 + 256; const uint16_t num_entries = get_task_num_entries(info->family); const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES; const uint32_t payload_ring_bytes = num_entries * payload_entry_size;