diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index e867228dec0..212dc7ef359 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1858,3 +1858,42 @@ void ac_get_hs_info(struct radeon_info *info, hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024); hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4; } + +static uint16_t get_task_num_entries(enum radeon_family fam) +{ + /* Number of task shader ring entries. Needs to be a power of two. + * Use a low number on smaller chips so we don't waste space, + * but keep it high on bigger chips so it doesn't inhibit parallelism. + * + * This number is compiled into task/mesh shaders as a constant. + * In order to ensure this works fine with the shader cache, we must + * base this decision on the chip family, not the number of CUs in + * the current GPU. (So, the cache remains consistent for all + * chips in the same family.) + */ + switch (fam) { + case CHIP_VANGOGH: + case CHIP_NAVI24: + case CHIP_REMBRANDT: + return 256; + case CHIP_NAVI21: + case CHIP_NAVI22: + case CHIP_NAVI23: + default: + return 1024; + } +} + +void ac_get_task_info(struct radeon_info *info, + struct ac_task_info *task_info) +{ + const uint16_t num_entries = get_task_num_entries(info->family); + const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES; + const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES; + + /* Ensure that the addresses of each ring are 256 byte aligned. 
*/ + task_info->num_entries = num_entries; + task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256); + task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256); + task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes; +} diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 07f1cbb556f..9bcaf74d3a0 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -274,6 +274,50 @@ struct ac_hs_info { void ac_get_hs_info(struct radeon_info *info, struct ac_hs_info *hs); +/* Task rings BO layout information. + * This BO is shared between GFX and ACE queues so that the ACE and GFX + * firmware can cooperate on task->mesh dispatches and is also used to + * store the task payload which is passed to mesh shaders. + * + * The driver only needs to create this BO once, + * and it will always be able to accommodate the maximum needed + * task payload size. + * + * The following memory layout is used: + * 1. Control buffer: 9 DWORDs, 256 byte aligned + * Used by the firmware to maintain the current state. + * (padding) + * 2. Draw ring: 4 DWORDs per entry, 256 byte aligned + * Task shaders store the mesh dispatch size here. + * (padding) + * 3. Payload ring: 16K bytes per entry, 256 byte aligned. + * This is where task payload is stored by task shaders and + * read by mesh shaders. + * + */ +struct ac_task_info { + uint32_t draw_ring_offset; + uint32_t payload_ring_offset; + uint32_t bo_size_bytes; + uint16_t num_entries; +}; + +/* Size of each payload entry in the task payload ring. + * Spec requires minimum 16K bytes. + */ +#define AC_TASK_PAYLOAD_ENTRY_BYTES 16384 + +/* Size of each draw entry in the task draw ring. + * 4 DWORDs per entry. + */ +#define AC_TASK_DRAW_ENTRY_BYTES 16 + +/* Size of the task control buffer. 9 DWORDs. 
*/ +#define AC_TASK_CTRLBUF_BYTES 36 + +void ac_get_task_info(struct radeon_info *info, + struct ac_task_info *task_info); + #ifdef __cplusplus } #endif diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h index c40330f3bc3..4b6d3b9667b 100644 --- a/src/amd/vulkan/radv_constants.h +++ b/src/amd/vulkan/radv_constants.h @@ -91,11 +91,6 @@ */ #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull -/* Size of each payload entry in the task payload ring. - * Spec requires minimum 16K bytes. - */ -#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384 - /* Number of invocations in each subgroup. */ #define RADV_SUBGROUP_SIZE 64 diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index d19a41ae345..4f3d7d730cc 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -834,24 +834,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm ac_get_gs_table_depth(device->rad_info.gfx_level, device->rad_info.family); ac_get_hs_info(&device->rad_info, &device->hs); - - /* Number of task shader ring entries. Needs to be a power of two. - * Use a low number on smaller chips so we don't waste space, - * but keep it high on bigger chips so it doesn't inhibit parallelism. - */ - switch (device->rad_info.family) { - case CHIP_VANGOGH: - case CHIP_NAVI24: - case CHIP_REMBRANDT: - device->task_num_entries = 256; - break; - case CHIP_NAVI21: - case CHIP_NAVI22: - case CHIP_NAVI23: - default: - device->task_num_entries = 1024; - break; - } + ac_get_task_info(&device->rad_info, &device->task_info); *device_out = device; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index d2295187b20..2f01c6e573f 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -332,9 +332,7 @@ struct radv_physical_device { uint32_t gs_table_depth; struct ac_hs_info hs; - - /* Number of entries in the task shader ring buffers. 
*/ - uint32_t task_num_entries; + struct ac_task_info task_info; }; struct radv_instance { diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 21174c2ed10..20dd59c3c83 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1094,12 +1094,12 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta return true; } else if (nir->info.stage == MESA_SHADER_TASK) { ac_nir_apply_first_task_to_task_shader(nir); - ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, - device->physical_device->task_num_entries); + ac_nir_lower_task_outputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES, + device->physical_device->task_info.num_entries); return true; } else if (nir->info.stage == MESA_SHADER_MESH) { - ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES, - device->physical_device->task_num_entries); + ac_nir_lower_mesh_inputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES, + device->physical_device->task_info.num_entries); return true; }