ac: Add task shader ring information.

Similarly to tessellation rings information, move the task
rings info to ac_gpu_info.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
This commit is contained in:
Timur Kristóf 2022-05-31 13:20:23 +02:00 committed by Marge Bot
parent 086e499b47
commit ac5ab8d227
6 changed files with 89 additions and 30 deletions

View file

@ -1858,3 +1858,42 @@ void ac_get_hs_info(struct radeon_info *info,
hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024);
hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4;
}
/* Pick the number of task shader ring entries for a chip family.
 *
 * The result must be a power of two. Smaller chips get fewer entries so
 * that ring space is not wasted; every other family (including NAVI21,
 * NAVI22 and NAVI23) gets a larger ring so parallelism is not inhibited.
 *
 * The value ends up as a compile-time constant inside task/mesh shaders.
 * To keep the shader cache consistent for all chips of one family, the
 * choice depends only on the family and never on the CU count of the
 * GPU that is actually present.
 */
static uint16_t get_task_num_entries(enum radeon_family fam)
{
   /* Small chips: keep the rings compact. */
   if (fam == CHIP_VANGOGH || fam == CHIP_NAVI24 || fam == CHIP_REMBRANDT) {
      return 256;
   }

   /* Everything else uses the large ring. */
   return 1024;
}
/* Compute the layout of the task rings BO for the given GPU.
 *
 * Writes the number of ring entries, the offsets of the draw ring and the
 * payload ring, and the total BO size into task_info. The entry count is
 * selected from info->family.
 */
void ac_get_task_info(struct radeon_info *info,
                      struct ac_task_info *task_info)
{
   const uint16_t entry_count = get_task_num_entries(info->family);

   task_info->num_entries = entry_count;

   /* Each ring has to start at a 256 byte aligned address, so the draw
    * ring begins at the first aligned offset past the control buffer.
    */
   task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);

   /* The payload ring follows the draw ring, again 256 byte aligned. */
   const uint32_t draw_ring_end =
      task_info->draw_ring_offset + entry_count * AC_TASK_DRAW_ENTRY_BYTES;
   task_info->payload_ring_offset = ALIGN(draw_ring_end, 256);

   /* The BO ends right after the payload ring. */
   const uint32_t payload_ring_size = entry_count * AC_TASK_PAYLOAD_ENTRY_BYTES;
   task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_size;
}

View file

@ -274,6 +274,50 @@ struct ac_hs_info {
void ac_get_hs_info(struct radeon_info *info,
struct ac_hs_info *hs);
/* Task rings BO layout information.
* This BO is shared between GFX and ACE queues so that the ACE and GFX
* firmware can cooperate on task->mesh dispatches and is also used to
* store the task payload which is passed to mesh shaders.
*
* The driver only needs to create this BO once,
* and it will always be able to accommodate the maximum needed
* task payload size.
*
* The following memory layout is used:
* 1. Control buffer: 9 DWORDs, 256 byte aligned
* Used by the firmware to maintain the current state.
* (padding)
* 2. Draw ring: 4 DWORDs per entry, 256 byte aligned
* Task shaders store the mesh dispatch size here.
* (padding)
* 3. Payload ring: 16K bytes per entry, 256 byte aligned.
* This is where task payload is stored by task shaders and
* read by mesh shaders.
*
*/
/* Describes where each ring lives inside the task rings BO and how big the
 * BO has to be. Filled in by ac_get_task_info().
 */
struct ac_task_info {
/* Offset of the draw ring from the start of the BO, in bytes (256 byte aligned). */
uint32_t draw_ring_offset;
/* Offset of the payload ring from the start of the BO, in bytes (256 byte aligned). */
uint32_t payload_ring_offset;
/* Total size of the BO in bytes: payload ring offset plus payload ring size. */
uint32_t bo_size_bytes;
/* Number of entries in the draw ring and in the payload ring. */
uint16_t num_entries;
};
/* Size in bytes of each payload entry in the task payload ring.
 * Spec requires minimum 16K bytes.
 */
#define AC_TASK_PAYLOAD_ENTRY_BYTES 16384
/* Size in bytes of each draw entry in the task draw ring.
 * 4 DWORDs per entry.
 */
#define AC_TASK_DRAW_ENTRY_BYTES 16
/* Size in bytes of the task control buffer. 9 DWORDs. */
#define AC_TASK_CTRLBUF_BYTES 36
/* Fills task_info with the task rings BO layout: the number of ring
 * entries (selected from info->family), the byte offsets of the draw and
 * payload rings, and the total BO size in bytes.
 */
void ac_get_task_info(struct radeon_info *info,
struct ac_task_info *task_info);
#ifdef __cplusplus
}
#endif

View file

@ -91,11 +91,6 @@
*/
#define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull
/* Size of each payload entry in the task payload ring.
* Spec requires minimum 16K bytes.
*/
#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384
/* Number of invocations in each subgroup. */
#define RADV_SUBGROUP_SIZE 64

View file

@ -834,24 +834,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
ac_get_gs_table_depth(device->rad_info.gfx_level, device->rad_info.family);
ac_get_hs_info(&device->rad_info, &device->hs);
/* Number of task shader ring entries. Needs to be a power of two.
* Use a low number on smaller chips so we don't waste space,
* but keep it high on bigger chips so it doesn't inhibit parallelism.
*/
switch (device->rad_info.family) {
case CHIP_VANGOGH:
case CHIP_NAVI24:
case CHIP_REMBRANDT:
device->task_num_entries = 256;
break;
case CHIP_NAVI21:
case CHIP_NAVI22:
case CHIP_NAVI23:
default:
device->task_num_entries = 1024;
break;
}
ac_get_task_info(&device->rad_info, &device->task_info);
*device_out = device;

View file

@ -332,9 +332,7 @@ struct radv_physical_device {
uint32_t gs_table_depth;
struct ac_hs_info hs;
/* Number of entries in the task shader ring buffers. */
uint32_t task_num_entries;
struct ac_task_info task_info;
};
struct radv_instance {

View file

@ -1094,12 +1094,12 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta
return true;
} else if (nir->info.stage == MESA_SHADER_TASK) {
ac_nir_apply_first_task_to_task_shader(nir);
ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
device->physical_device->task_num_entries);
ac_nir_lower_task_outputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
device->physical_device->task_info.num_entries);
return true;
} else if (nir->info.stage == MESA_SHADER_MESH) {
ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
device->physical_device->task_num_entries);
ac_nir_lower_mesh_inputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
device->physical_device->task_info.num_entries);
return true;
}