radv: Introduce ring info structure for queues, refactor preamble generation.

Put the shader ring information into a separate structure.
Also clean up how this information is used when generating the preambles
and add a few comments that explain how it works.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16395>
Author: Timur Kristóf, 2022-05-05 22:46:38 +02:00 (committed by Marge Bot)
commit 37abbaee36, parent e39a5f2b9f
2 changed files with 94 additions and 118 deletions
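Before the diff, a quick orientation: the commit replaces the long argument list of radv_update_preamble_cs() with a single struct radv_queue_ring_info, and the queue now remembers its current ring_info, so each submission can accumulate a "needs" value and compare it against what the queue already has before rebuilding anything. Below is a minimal standalone sketch of that flow; the reduced field set and the queue/accumulate/equal/main helpers are illustrative stand-ins, not the actual RADV code.

/* Minimal sketch of the pattern this commit introduces. The field names
 * mirror struct radv_queue_ring_info from the diff; everything else here
 * (struct queue, accumulate, equal, main) is an illustrative stand-in. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ring_info {
   uint32_t scratch_size_per_wave;
   uint32_t scratch_waves;
   uint32_t esgs_ring_size;
   bool tess_rings;
};

struct queue {
   struct ring_info ring_info; /* what the queue currently has allocated */
};

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Gather a submission's needs on top of what the queue already has:
 * sizes only ever grow and boolean features stay enabled once requested. */
static void accumulate(struct ring_info *needs, const struct ring_info *cmd)
{
   needs->scratch_size_per_wave = max_u32(needs->scratch_size_per_wave, cmd->scratch_size_per_wave);
   needs->scratch_waves = max_u32(needs->scratch_waves, cmd->scratch_waves);
   needs->esgs_ring_size = max_u32(needs->esgs_ring_size, cmd->esgs_ring_size);
   needs->tess_rings |= cmd->tess_rings;
}

static bool equal(const struct ring_info *a, const struct ring_info *b)
{
   return a->scratch_size_per_wave == b->scratch_size_per_wave &&
          a->scratch_waves == b->scratch_waves &&
          a->esgs_ring_size == b->esgs_ring_size && a->tess_rings == b->tess_rings;
}

int main(void)
{
   struct queue q = {0};
   /* Pretend one command buffer wants a 64 KiB ES->GS ring and tess rings. */
   struct ring_info cmd = {.esgs_ring_size = 64 * 1024, .tess_rings = true};

   struct ring_info needs = q.ring_info; /* start from the queue's current state */
   accumulate(&needs, &cmd);

   if (equal(&needs, &q.ring_info)) {
      puts("preambles already match, nothing to do");
   } else {
      puts("grow rings and rebuild preambles");
      q.ring_info = needs; /* remember the new state for the next submission */
   }
   return 0;
}

In the real driver the comparison is done field by field in radv_update_preambles() and a match returns VK_SUCCESS before any buffers are touched; see the first changed file below.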

Changed file 1 of 2:

@@ -3985,11 +3985,7 @@ radv_init_compute_state(struct radeon_cmdbuf *cs, struct radv_queue *queue)
}
static VkResult
radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave,
uint32_t scratch_waves, uint32_t compute_scratch_size_per_wave,
uint32_t compute_scratch_waves, uint32_t esgs_ring_size,
uint32_t gsvs_ring_size, bool needs_tess_rings, bool needs_gds,
bool needs_gds_oa, bool needs_sample_positions)
radv_update_preamble_cs(struct radv_queue *queue, const struct radv_queue_ring_info *needs)
{
struct radeon_winsys_bo *scratch_bo = queue->scratch_bo;
struct radeon_winsys_bo *descriptor_bo = queue->descriptor_bo;
@@ -4000,40 +3996,14 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
struct radeon_winsys_bo *gds_bo = queue->gds_bo;
struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo;
struct radeon_cmdbuf *dest_cs[3] = {0};
uint32_t ring_bo_flags = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING;
const uint32_t ring_bo_flags = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING;
VkResult result = VK_SUCCESS;
const bool add_tess_rings = !queue->has_tess_rings && needs_tess_rings;
const bool add_gds = !queue->has_gds && needs_gds;
const bool add_gds_oa = !queue->has_gds_oa && needs_gds_oa;
const bool add_sample_positions = !queue->has_sample_positions && needs_sample_positions;
const bool add_sample_positions = !queue->ring_info.sample_positions && needs->sample_positions;
const uint32_t scratch_size = needs->scratch_size_per_wave * needs->scratch_waves;
const uint32_t queue_scratch_size =
queue->ring_info.scratch_size_per_wave * queue->ring_info.scratch_waves;
scratch_size_per_wave = MAX2(scratch_size_per_wave, queue->scratch_size_per_wave);
if (scratch_size_per_wave)
scratch_waves = MIN2(scratch_waves, UINT32_MAX / scratch_size_per_wave);
else
scratch_waves = 0;
compute_scratch_size_per_wave =
MAX2(compute_scratch_size_per_wave, queue->compute_scratch_size_per_wave);
if (compute_scratch_size_per_wave)
compute_scratch_waves =
MIN2(compute_scratch_waves, UINT32_MAX / compute_scratch_size_per_wave);
else
compute_scratch_waves = 0;
if (scratch_size_per_wave <= queue->scratch_size_per_wave &&
scratch_waves <= queue->scratch_waves &&
compute_scratch_size_per_wave <= queue->compute_scratch_size_per_wave &&
compute_scratch_waves <= queue->compute_scratch_waves &&
esgs_ring_size <= queue->esgs_ring_size && gsvs_ring_size <= queue->gsvs_ring_size &&
!add_tess_rings && !add_gds && !add_gds_oa && !add_sample_positions &&
queue->initial_preamble_cs) {
return VK_SUCCESS;
}
uint32_t scratch_size = scratch_size_per_wave * scratch_waves;
uint32_t queue_scratch_size = queue->scratch_size_per_wave * queue->scratch_waves;
if (scratch_size > queue_scratch_size) {
result =
queue->device->ws->buffer_create(queue->device->ws, scratch_size, 4096, RADEON_DOMAIN_VRAM,
@@ -4042,9 +4012,10 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
goto fail;
}
uint32_t compute_scratch_size = compute_scratch_size_per_wave * compute_scratch_waves;
uint32_t compute_queue_scratch_size =
queue->compute_scratch_size_per_wave * queue->compute_scratch_waves;
const uint32_t compute_scratch_size =
needs->compute_scratch_size_per_wave * needs->compute_scratch_waves;
const uint32_t compute_queue_scratch_size =
queue->ring_info.compute_scratch_size_per_wave * queue->ring_info.compute_scratch_waves;
if (compute_scratch_size > compute_queue_scratch_size) {
result = queue->device->ws->buffer_create(queue->device->ws, compute_scratch_size, 4096,
RADEON_DOMAIN_VRAM, ring_bo_flags,
@@ -4053,25 +4024,23 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
goto fail;
}
esgs_ring_size = MAX2(esgs_ring_size, queue->esgs_ring_size);
if (esgs_ring_size > queue->esgs_ring_size) {
result = queue->device->ws->buffer_create(queue->device->ws, esgs_ring_size, 4096,
if (needs->esgs_ring_size > queue->ring_info.esgs_ring_size) {
result = queue->device->ws->buffer_create(queue->device->ws, needs->esgs_ring_size, 4096,
RADEON_DOMAIN_VRAM, ring_bo_flags,
RADV_BO_PRIORITY_SCRATCH, 0, &esgs_ring_bo);
if (result != VK_SUCCESS)
goto fail;
}
gsvs_ring_size = MAX2(gsvs_ring_size, queue->gsvs_ring_size);
if (gsvs_ring_size > queue->gsvs_ring_size) {
result = queue->device->ws->buffer_create(queue->device->ws, gsvs_ring_size, 4096,
if (needs->gsvs_ring_size > queue->ring_info.gsvs_ring_size) {
result = queue->device->ws->buffer_create(queue->device->ws, needs->gsvs_ring_size, 4096,
RADEON_DOMAIN_VRAM, ring_bo_flags,
RADV_BO_PRIORITY_SCRATCH, 0, &gsvs_ring_bo);
if (result != VK_SUCCESS)
goto fail;
}
if (add_tess_rings) {
if (!queue->ring_info.tess_rings && needs->tess_rings) {
result = queue->device->ws->buffer_create(
queue->device->ws, queue->device->hs.tess_offchip_ring_offset + queue->device->hs.tess_offchip_ring_size, 256,
RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &tess_rings_bo);
@@ -4079,7 +4048,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
goto fail;
}
if (add_gds) {
if (!queue->ring_info.gds && needs->gds) {
assert(queue->device->physical_device->rad_info.gfx_level >= GFX10);
/* 4 streamout GDS counters.
@@ -4092,7 +4061,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
goto fail;
}
if (add_gds_oa) {
if (!queue->ring_info.gds_oa && needs->gds_oa) {
assert(queue->device->physical_device->rad_info.gfx_level >= GFX10);
result =
@@ -4141,8 +4110,8 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
}
if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || add_sample_positions)
radv_fill_shader_rings(queue, map, add_sample_positions, esgs_ring_size, esgs_ring_bo,
gsvs_ring_size, gsvs_ring_bo, tess_rings_bo);
radv_fill_shader_rings(queue, map, add_sample_positions, needs->esgs_ring_size,
esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo);
queue->device->ws->buffer_unmap(descriptor_bo);
}
@@ -4155,9 +4124,9 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
queue->device->physical_device->rad_info.gfx_level >= GFX7)
continue;
/* Continue preamble is unnecessary when no shader rings are used. */
if (!scratch_size_per_wave && !compute_scratch_size_per_wave && !esgs_ring_size &&
!gsvs_ring_size && !needs_tess_rings && !needs_gds && !needs_gds_oa &&
!needs_sample_positions)
if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave &&
!needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings &&
!needs->gds && !needs->gds_oa && !needs->sample_positions)
continue;
}
@@ -4188,19 +4157,20 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
}
radv_emit_gs_ring_sizes(queue, cs, esgs_ring_bo, esgs_ring_size, gsvs_ring_bo,
gsvs_ring_size);
radv_emit_gs_ring_sizes(queue, cs, esgs_ring_bo, needs->esgs_ring_size, gsvs_ring_bo,
needs->gsvs_ring_size);
radv_emit_tess_factor_ring(queue, cs, tess_rings_bo);
radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
radv_emit_compute_scratch(queue, cs, compute_scratch_size_per_wave, compute_scratch_waves,
compute_scratch_bo);
radv_emit_graphics_scratch(queue, cs, scratch_size_per_wave, scratch_waves, scratch_bo);
radv_emit_compute_scratch(queue, cs, needs->compute_scratch_size_per_wave,
needs->compute_scratch_waves, compute_scratch_bo);
radv_emit_graphics_scratch(queue, cs, needs->scratch_size_per_wave, needs->scratch_waves,
scratch_bo);
break;
case RADV_QUEUE_COMPUTE:
radv_init_compute_state(cs, queue);
radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
radv_emit_compute_scratch(queue, cs, compute_scratch_size_per_wave, compute_scratch_waves,
compute_scratch_bo);
radv_emit_compute_scratch(queue, cs, needs->compute_scratch_size_per_wave,
needs->compute_scratch_waves, compute_scratch_bo);
break;
default:
break;
@@ -4252,56 +4222,35 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
queue->device->ws->buffer_destroy(queue->device->ws, queue->scratch_bo);
queue->scratch_bo = scratch_bo;
}
queue->scratch_size_per_wave = scratch_size_per_wave;
queue->scratch_waves = scratch_waves;
if (compute_scratch_bo != queue->compute_scratch_bo) {
if (queue->compute_scratch_bo)
queue->device->ws->buffer_destroy(queue->device->ws, queue->compute_scratch_bo);
queue->compute_scratch_bo = compute_scratch_bo;
}
queue->compute_scratch_size_per_wave = compute_scratch_size_per_wave;
queue->compute_scratch_waves = compute_scratch_waves;
if (esgs_ring_bo != queue->esgs_ring_bo) {
if (queue->esgs_ring_bo)
queue->device->ws->buffer_destroy(queue->device->ws, queue->esgs_ring_bo);
queue->esgs_ring_bo = esgs_ring_bo;
queue->esgs_ring_size = esgs_ring_size;
}
if (gsvs_ring_bo != queue->gsvs_ring_bo) {
if (queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(queue->device->ws, queue->gsvs_ring_bo);
queue->gsvs_ring_bo = gsvs_ring_bo;
queue->gsvs_ring_size = gsvs_ring_size;
}
if (tess_rings_bo != queue->tess_rings_bo) {
queue->tess_rings_bo = tess_rings_bo;
queue->has_tess_rings = true;
}
if (gds_bo != queue->gds_bo) {
queue->gds_bo = gds_bo;
queue->has_gds = true;
}
if (gds_oa_bo != queue->gds_oa_bo) {
queue->gds_oa_bo = gds_oa_bo;
queue->has_gds_oa = true;
}
if (descriptor_bo != queue->descriptor_bo) {
if (queue->descriptor_bo)
queue->device->ws->buffer_destroy(queue->device->ws, queue->descriptor_bo);
queue->descriptor_bo = descriptor_bo;
}
if (add_sample_positions)
queue->has_sample_positions = true;
queue->tess_rings_bo = tess_rings_bo;
queue->gds_bo = gds_bo;
queue->gds_oa_bo = gds_oa_bo;
queue->ring_info = *needs;
return VK_SUCCESS;
fail:
for (int i = 0; i < ARRAY_SIZE(dest_cs); ++i)
@@ -4447,34 +4396,57 @@ radv_update_preambles(struct radv_queue *queue, struct vk_command_buffer *const
if (queue->qf == RADV_QUEUE_TRANSFER)
return VK_SUCCESS;
uint32_t scratch_size_per_wave = 0, waves_wanted = 0;
uint32_t compute_scratch_size_per_wave = 0, compute_waves_wanted = 0;
uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
bool tess_rings_needed = false;
bool gds_needed = false;
bool gds_oa_needed = false;
bool sample_positions_needed = false;
/* Figure out the needs of the current submission.
* Start by copying the queue's current info.
* This is done because we only allow two possible behaviours for these buffers:
* - Grow when the newly needed amount is larger than what we had
* - Allocate the max size and reuse it, but don't free it until the queue is destroyed
*/
struct radv_queue_ring_info needs = queue->ring_info;
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
struct radv_cmd_buffer *cmd_buffer = container_of(cmd_buffers[j], struct radv_cmd_buffer, vk);
scratch_size_per_wave = MAX2(scratch_size_per_wave, cmd_buffer->scratch_size_per_wave_needed);
waves_wanted = MAX2(waves_wanted, cmd_buffer->scratch_waves_wanted);
compute_scratch_size_per_wave =
MAX2(compute_scratch_size_per_wave, cmd_buffer->compute_scratch_size_per_wave_needed);
compute_waves_wanted = MAX2(compute_waves_wanted, cmd_buffer->compute_scratch_waves_wanted);
esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
tess_rings_needed |= cmd_buffer->tess_rings_needed;
gds_needed |= cmd_buffer->gds_needed;
gds_oa_needed |= cmd_buffer->gds_oa_needed;
sample_positions_needed |= cmd_buffer->sample_positions_needed;
needs.scratch_size_per_wave =
MAX2(needs.scratch_size_per_wave, cmd_buffer->scratch_size_per_wave_needed);
needs.scratch_waves = MAX2(needs.scratch_waves, cmd_buffer->scratch_waves_wanted);
needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_size_per_wave,
cmd_buffer->compute_scratch_size_per_wave_needed);
needs.compute_scratch_waves =
MAX2(needs.compute_scratch_waves, cmd_buffer->compute_scratch_waves_wanted);
needs.esgs_ring_size = MAX2(needs.esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
needs.tess_rings |= cmd_buffer->tess_rings_needed;
needs.gds |= cmd_buffer->gds_needed;
needs.gds_oa |= cmd_buffer->gds_oa_needed;
needs.sample_positions |= cmd_buffer->sample_positions_needed;
}
return radv_update_preamble_cs(queue, scratch_size_per_wave, waves_wanted,
compute_scratch_size_per_wave, compute_waves_wanted,
esgs_ring_size, gsvs_ring_size, tess_rings_needed, gds_needed,
gds_oa_needed, sample_positions_needed);
/* Sanitize scratch size information. */
needs.scratch_waves = needs.scratch_size_per_wave
? MIN2(needs.scratch_waves, UINT32_MAX / needs.scratch_size_per_wave)
: 0;
needs.compute_scratch_waves =
needs.compute_scratch_size_per_wave
? MIN2(needs.compute_scratch_waves, UINT32_MAX / needs.compute_scratch_size_per_wave)
: 0;
/* Return early if we already match these needs.
* Note that it's not possible for any of the needed values to be less
* than what the queue already had, because we only ever increase the allocated size.
*/
if (queue->initial_full_flush_preamble_cs &&
queue->ring_info.scratch_size_per_wave == needs.scratch_size_per_wave &&
queue->ring_info.scratch_waves == needs.scratch_waves &&
queue->ring_info.compute_scratch_size_per_wave == needs.compute_scratch_size_per_wave &&
queue->ring_info.compute_scratch_waves == needs.compute_scratch_waves &&
queue->ring_info.esgs_ring_size == needs.esgs_ring_size &&
queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size &&
queue->ring_info.tess_rings == needs.tess_rings && queue->ring_info.gds == needs.gds &&
queue->ring_info.gds_oa == needs.gds_oa &&
queue->ring_info.sample_positions == needs.sample_positions)
return VK_SUCCESS;
return radv_update_preamble_cs(queue, &needs);
}
struct radv_deferred_queue_submission {
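One detail worth highlighting from the hunk above: before the early-return comparison, the wave counts are clamped so that size_per_wave * waves can never overflow the 32-bit sizes used for the scratch allocations. A tiny standalone illustration of the same guard follows; clamp_waves() and main() are hypothetical, not RADV code.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the clamp in radv_update_preambles():
 * waves is limited so that size_per_wave * waves still fits in a uint32_t. */
static uint32_t clamp_waves(uint32_t size_per_wave, uint32_t waves)
{
   if (!size_per_wave)
      return 0; /* no scratch requested at all */
   uint32_t max_waves = UINT32_MAX / size_per_wave;
   return waves < max_waves ? waves : max_waves;
}

int main(void)
{
   /* With 1 MiB of scratch per wave, at most 4095 waves fit below UINT32_MAX bytes. */
   printf("%" PRIu32 "\n", clamp_waves(1u << 20, 8192)); /* prints 4095 */
   return 0;
}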

Changed file 2 of 2:

@@ -704,6 +704,19 @@ vk_queue_to_radv(const struct radv_physical_device *phys_dev, int queue_family_i
enum amd_ip_type radv_queue_family_to_ring(struct radv_physical_device *physical_device,
enum radv_queue_family f);
struct radv_queue_ring_info {
uint32_t scratch_size_per_wave;
uint32_t scratch_waves;
uint32_t compute_scratch_size_per_wave;
uint32_t compute_scratch_waves;
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool tess_rings;
bool gds;
bool gds_oa;
bool sample_positions;
};
struct radv_queue {
struct vk_queue vk;
struct radv_device *device;
@@ -711,16 +724,7 @@ struct radv_queue {
enum radeon_ctx_priority priority;
enum radv_queue_family qf;
uint32_t scratch_size_per_wave;
uint32_t scratch_waves;
uint32_t compute_scratch_size_per_wave;
uint32_t compute_scratch_waves;
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool has_tess_rings;
bool has_gds;
bool has_gds_oa;
bool has_sample_positions;
struct radv_queue_ring_info ring_info;
struct radeon_winsys_bo *scratch_bo;
struct radeon_winsys_bo *descriptor_bo;