tu: Rewrite visibility stream allocation

The mechanism the hardware implements to synchronize against
write-after-read hazards on the visibility stream during concurrent
binning is for BV and BR to each keep a count of the render passes they
have finished, with BV waiting until
BR_count >= BV_count - vis_stream_count. For example, if there are two
visibility streams and the user submits three render passes, then
before starting render pass #3 BV will wait for BR to finish render
pass #1. Render passes #3 and #2 are assumed to use different
visibility streams, so it's safe for BV to start working on #3 once #1
is done.
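
To make the arithmetic concrete, here is a minimal sketch of the wait
condition (the names are illustrative stand-ins for the hardware's
internal counters, not real registers):

/* Illustrative only: models the BR/BV handshake described above.
 * next_pass is the 1-based index of the render pass BV is about to
 * start, br_finished is how many render passes BR has completed.
 */
static bool
bv_may_start(int next_pass, int br_finished, int vis_stream_count)
{
   /* Pass N reuses the visibility stream of pass N - vis_stream_count,
    * so BR must have finished reading that pass before BV overwrites
    * its stream: with two streams and next_pass = 3, BV waits until
    * br_finished >= 1.
    */
   return br_finished >= next_pass - vis_stream_count;
}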

This mechanism is assumed to work across render passes and even across
submits, and the only way to reset the BR/BV counts is via
CP_RESET_CONTEXT_STATE, which is only executed by the kernel when
switching contexts. This vastly complicates things for Vulkan, where we
have no idea in what order command buffers will be submitted. It means
we have to defer emitting the actual visibility stream pointers until
submission time and record patchpoints instead. Unfortunately this gets
very complicated with SIMULTANEOUS_USE_BIT, where the same command
buffer may already be executing and we have to update the patchpoints
on the GPU.
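
At submission time each patchpoint is then resolved in one of two ways.
A simplified sketch (the struct mirrors tu_vis_stream_patchpoint from
the diff below; resolve_one() is an invented name for illustration):

struct patchpoint {
   uint32_t *data;  /* CPU mapping of the placeholder qword in the CS */
   uint64_t iova;   /* GPU address of that same qword */
   uint32_t offset; /* offset of this pointer within the vis stream BO */
};

static void
resolve_one(struct tu_cs *patch_cs, struct patchpoint *p,
            uint64_t bo_iova, bool simultaneous_use)
{
   uint64_t final_iova = bo_iova + p->offset;
   if (!simultaneous_use) {
      /* Not in flight: patch the command stream directly on the CPU. */
      p->data[0] = final_iova;
      p->data[1] = final_iova >> 32;
   } else {
      /* Possibly already executing: emit CP_MEM_WRITE into a separate
       * CS so the CP rewrites the placeholder before the patched
       * stream runs.
       */
      tu_cs_emit_pkt7(patch_cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(patch_cs, p->iova);
      tu_cs_emit_qw(patch_cs, final_iova);
   }
}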

I've taken the liberty of also deferring the allocation of the
visibility stream itself until submit time. This will help us later
move to per-queue visibility streams, which will be necessary to
support multiple simultaneous queues.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
Connor Abbott 2025-05-19 17:56:05 -04:00 committed by Marge Bot
parent 416dc87be9
commit 50aa66a7c1
5 changed files with 358 additions and 23 deletions

View file

@@ -201,21 +201,32 @@ tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd)
    mtx_unlock(&dev->mutex);
 
-   struct tu_bo *vsc_bo;
    uint32_t prim_strm_size = cmd->vsc_prim_strm_pitch * num_vsc_pipes;
    uint32_t draw_strm_size = cmd->vsc_draw_strm_pitch * num_vsc_pipes;
    uint32_t draw_strm_size_size = 4 * num_vsc_pipes;
    uint32_t state_size = 4 * num_vsc_pipes;
 
-   tu_get_scratch_bo(dev,
-                     prim_strm_size + draw_strm_size + draw_strm_size_size +
-                        state_size,
-                     &vsc_bo);
+   cmd->vsc_size =
+      prim_strm_size + draw_strm_size + draw_strm_size_size + state_size;
 
-   cmd->vsc_prim_strm_va = vsc_bo->iova;
-   cmd->vsc_draw_strm_va = vsc_bo->iova + prim_strm_size;
-   cmd->vsc_draw_strm_size_va = cmd->vsc_draw_strm_va + draw_strm_size;
-   cmd->vsc_state_va = cmd->vsc_draw_strm_size_va + draw_strm_size_size;
+   cmd->vsc_prim_strm_offset = 0;
+   cmd->vsc_draw_strm_offset = prim_strm_size;
+   cmd->vsc_draw_strm_size_offset = cmd->vsc_draw_strm_offset + draw_strm_size;
+   cmd->vsc_state_offset = cmd->vsc_draw_strm_size_offset + draw_strm_size_size;
+}
+
+static void
+tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                              uint32_t offset)
+{
+   struct tu_vis_stream_patchpoint patchpoint = {
+      .data = cs->cur,
+      .iova = tu_cs_get_cur_iova(cs),
+      .offset = offset,
+   };
+
+   util_dynarray_append(&cmd->vis_stream_patchpoints, patchpoint);
+
+   tu_cs_emit_qw(cs, offset);
 }
 
 template <chip CHIP>
@@ -223,20 +234,20 @@ static void
 tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
    if (CHIP == A6XX) {
-      tu_cs_emit_regs(cs,
-                      A6XX_VSC_SIZE_BASE(.qword = cmd->vsc_draw_strm_size_va));
-      tu_cs_emit_regs(cs,
-                      A6XX_VSC_PIPE_DATA_PRIM_BASE(.qword = cmd->vsc_prim_strm_va));
-      tu_cs_emit_regs(
-         cs, A6XX_VSC_PIPE_DATA_DRAW_BASE(.qword = cmd->vsc_draw_strm_va));
+      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_SIZE_BASE, 2);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
+      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_PRIM_BASE, 2);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
+      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_DRAW_BASE, 2);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
    } else {
       tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3);
       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_DRAW_BASE));
-      tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_SIZE_BASE));
-      tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
       tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_PRIM_BASE));
-      tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va);
+      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
    }
 
    cmd->vsc_initialized = true;
@@ -1278,7 +1289,13 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                   A6XX_CP_SET_MARKER_0_USES_GMEM);
 
    if (CHIP == A6XX && cmd->device->physical_device->has_preemption) {
+      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+         tu_cs_set_writeable(cs, true);
+
       tu_emit_vsc<CHIP>(cmd, &cmd->cs);
+
+      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+         tu_cs_set_writeable(cs, false);
    }
 
    unsigned views = tu_fdm_num_layers(cmd);
@@ -2798,8 +2815,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
     * emits the preamble lazily. We chose the per-bin approach but blob's
     * should be a better one.
     */
+   if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+      tu_cs_set_writeable(cs, true);
+
    tu_emit_vsc<CHIP>(cmd, cs);
+
+   if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+      tu_cs_set_writeable(cs, false);
    tu6_emit_bin_size<CHIP>(cs, tiling->tile0.width, tiling->tile0.height,
                            {
                               .render_mode = BINNING_PASS,
@@ -2855,13 +2878,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
       tu6_lazy_init_vsc(cmd);
 
       /* Upload state regs to memory to be restored on skipsaverestore
-       * preemption.
+       * preemption. On a7xx this is considered part of the vis stream that
+       * requires a patchpoint.
        */
+      if (CHIP >= A7XX &&
+          (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
+         tu_cs_set_writeable(cs, true);
+
       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) |
                      CP_REG_TO_MEM_0_CNT(32));
       if (CHIP >= A7XX)
-         tu_cs_emit_qw(cs, cmd->vsc_state_va);
+         tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
       else
          tu_cs_emit_qw(cs, global_iova(cmd, vsc_state));
@@ -2874,8 +2902,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
          tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4);
          tu_cs_emit(cs, num_vsc_pipes); /* count */
          tu_cs_emit(cs, 0); /* offset */
-         tu_cs_emit_qw(cs, cmd->vsc_state_va);
+         tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
       }
+
+      if (CHIP >= A7XX &&
+          (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
+         tu_cs_set_writeable(cs, false);
    }
 
    tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
@@ -3573,6 +3605,26 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
    ralloc_free(cmd_buffer->pre_chain.patchpoints_ctx);
    util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints);
    util_dynarray_fini(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
+   util_dynarray_fini(&cmd_buffer->vis_stream_patchpoints);
+
+   util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
+                          bo) {
+      tu_bo_finish(cmd_buffer->device, *bo);
+   }
+
+   mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
+   util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
+                          struct tu_vis_stream_patchpoint_cs,
+                          bo) {
+      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
+                          &bo->cs_bo);
+      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
+                          &bo->fence_bo);
+   }
+   mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);
+
+   util_dynarray_fini(&cmd_buffer->vis_stream_bos);
+   util_dynarray_fini(&cmd_buffer->vis_stream_cs_bos);
 
    vk_command_buffer_finish(&cmd_buffer->vk);
    vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
@@ -3649,6 +3701,26 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
    cmd_buffer->pre_chain.patchpoints_ctx = NULL;
    util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
    util_dynarray_clear(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
+   util_dynarray_clear(&cmd_buffer->vis_stream_patchpoints);
+
+   util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
+                          bo) {
+      tu_bo_finish(cmd_buffer->device, *bo);
+   }
+
+   mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
+   util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
+                          struct tu_vis_stream_patchpoint_cs,
+                          bo) {
+      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
+                          &bo->cs_bo);
+      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
+                          &bo->fence_bo);
+   }
+   mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);
+
+   util_dynarray_clear(&cmd_buffer->vis_stream_bos);
+   util_dynarray_clear(&cmd_buffer->vis_stream_cs_bos);
 }
 
 const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
@@ -5562,6 +5634,58 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
          util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
                                        &secondary->fdm_bin_patchpoints);
       } else {
+         struct tu_cs *cs = &cmd->cs;
+
+         /* If the secondary can be used multiple times, we have to set its
+          * patchpoints on the GPU. Set them here, and create a new
+          * patchpoint pointing to the CP_MEM_WRITE packet. Otherwise just
+          * copy them over adjusting the index.
+          */
+         bool simultaneous_use = secondary->usage_flags &
+            VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+
+         /* If this cmdbuf itself can be used multiple times in a submit then
+          * its patchpoint will also be updated on the GPU.
+          */
+         if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+            tu_cs_set_writeable(cs, true);
+
+         util_dynarray_foreach (&secondary->vis_stream_patchpoints,
+                                struct tu_vis_stream_patchpoint,
+                                secondary_patchpoint) {
+            struct tu_vis_stream_patchpoint patchpoint =
+               *secondary_patchpoint;
+
+            if (simultaneous_use) {
+               tu_cs_reserve_space(cs, 5);
+               tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+               tu_cs_emit_qw(cs, patchpoint.iova);
+               patchpoint.iova = tu_cs_get_cur_iova(cs);
+               patchpoint.data = cs->cur;
+               tu_cs_emit_qw(cs, 0);
+            }
+
+            util_dynarray_append(&cmd->vis_stream_patchpoints,
+                                 patchpoint);
+         }
+
+         if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
+            tu_cs_set_writeable(cs, false);
+
+         if (simultaneous_use) {
+            tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+            tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+
+            /* Make BV wait for updates on BR to land */
+            if (cmd->device->physical_device->info->chip >= 7) {
+               tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
+               tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
+                              CP_THREAD_CONTROL_0_SYNC_THREADS);
+            }
+         }
+
+         cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size);
+
          switch (secondary->state.suspend_resume) {
          case SR_NONE:
             assert(tu_cs_is_empty(&secondary->draw_cs));

View file

@@ -618,6 +618,10 @@ struct tu_cmd_buffer
    void *patchpoints_ctx;
    struct util_dynarray fdm_bin_patchpoints;
 
+   struct util_dynarray vis_stream_patchpoints;
+   struct util_dynarray vis_stream_bos;
+   struct util_dynarray vis_stream_cs_bos;
+
    VkCommandBufferUsageFlags usage_flags;
 
    VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
@@ -686,8 +690,9 @@ struct tu_cmd_buffer
    uint32_t vsc_draw_strm_pitch;
    uint32_t vsc_prim_strm_pitch;
-   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
-   uint64_t vsc_state_va;
+   uint32_t vsc_draw_strm_offset, vsc_draw_strm_size_offset;
+   uint32_t vsc_prim_strm_offset, vsc_state_offset;
+   uint64_t vsc_size;
    bool vsc_initialized;
 
    bool prev_fsr_is_null;
@@ -833,6 +838,16 @@ struct tu_fdm_bin_patchpoint {
    tu_fdm_bin_apply_t apply;
 };
 
+struct tu_vis_stream_patchpoint {
+   uint32_t *data;
+   uint64_t iova;
+   uint32_t offset;
+};
+
+struct tu_vis_stream_patchpoint_cs {
+   struct tu_suballoc_bo cs_bo;
+   struct tu_suballoc_bo fence_bo;
+};
+
 void
 tu_barrier(struct tu_cmd_buffer *cmd,

View file

@@ -2725,6 +2725,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    mtx_init(&device->radix_sort_mutex, mtx_plain);
    mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain);
    mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain);
+   mtx_init(&device->vis_stream_mtx, mtx_plain);
+   mtx_init(&device->vis_stream_suballocator_mtx, mtx_plain);
    mtx_init(&device->mutex, mtx_plain);
    mtx_init(&device->copy_timestamp_cs_pool_mutex, mtx_plain);
 #ifdef HAVE_PERFETTO
@@ -2853,6 +2855,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                            getpagesize(), TU_BO_ALLOC_INTERNAL_RESOURCE,
                            "event_suballoc");
 
+   tu_bo_suballocator_init(
+      &device->vis_stream_suballocator, device,
+      getpagesize(),
+      (enum tu_bo_alloc_flags)(TU_BO_ALLOC_INTERNAL_RESOURCE |
+                               TU_BO_ALLOC_ALLOW_DUMP),
+      "vis_stream_suballoc");
+
    result = tu_bo_init_new(
       device, NULL, &device->global_bo, global_size,
       (enum tu_bo_alloc_flags) (TU_BO_ALLOC_ALLOW_DUMP |
@@ -3146,12 +3155,16 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    tu_bo_suballocator_finish(&device->autotune_suballoc);
    tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
    tu_bo_suballocator_finish(&device->event_suballoc);
+   tu_bo_suballocator_finish(&device->vis_stream_suballocator);
 
    tu_bo_finish(device, device->global_bo);
 
    if (device->vm_bind_fence_fd != -1)
       close(device->vm_bind_fence_fd);
 
+   if (device->vis_stream_bo)
+      tu_bo_finish(device, device->vis_stream_bo);
+
    if (device->null_accel_struct_bo)
       tu_bo_finish(device, device->null_accel_struct_bo);

View file

@@ -363,6 +363,11 @@ struct tu_device
    struct tu_suballocator *trace_suballoc;
    mtx_t trace_mutex;
 
+   /* VSC patchpoint BO suballocator.
+    */
+   struct tu_suballocator vis_stream_suballocator;
+   mtx_t vis_stream_suballocator_mtx;
+
    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
 #define TU_TESS_FACTOR_SIZE (8 * 1024)
 #define TU_TESS_PARAM_SIZE (128 * 1024)
@@ -433,6 +438,9 @@ struct tu_device
    struct tu_cs_entry bin_preamble_entry, bin_preamble_bv_entry;
 
+   struct tu_bo *vis_stream_bo;
+   mtx_t vis_stream_mtx;
+
    struct util_dynarray dynamic_rendering_pending;
    VkCommandPool dynamic_rendering_pool;
    uint32_t dynamic_rendering_fence;

View file

@@ -85,6 +85,176 @@ submit_add_entries(struct tu_device *dev, void *submit,
    }
 }
 
+/* Normally, we can just resolve visibility stream patchpoints on the CPU by
+ * writing directly to the command stream with the final iova of the allocated
+ * BO. However this doesn't work with SIMULTANEOUS_USE command buffers, where
+ * the same buffer may be in flight more than once, including within a submit.
+ * To handle this we have to update the patchpoints on the GPU. The lifetime
+ * of the CS used to write the patchpoints on the GPU is tricky, since if we
+ * always allocate a new one for each submit the size could grow infinitely if
+ * the command buffer is never freed or reset. Instead this implements a pool
+ * of patchpoint CS's per command buffer that reuses finished CS's.
+ */
+static VkResult
+get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd,
+                             struct tu_cs *cs,
+                             struct tu_cs *sub_cs,
+                             uint64_t *fence_iova)
+{
+   /* See below for the commands emitted to the CS. */
+   uint32_t cs_size = 5 *
+      util_dynarray_num_elements(&cmd->vis_stream_patchpoints,
+                                 struct tu_vis_stream_patchpoint) + 6;
+
+   util_dynarray_foreach (&cmd->vis_stream_cs_bos,
+                          struct tu_vis_stream_patchpoint_cs,
+                          patchpoint_cs) {
+      uint32_t *fence = (uint32_t *)patchpoint_cs->fence_bo.bo->map;
+      if (*fence == 1) {
+         *fence = 0;
+         tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs->cs_bo);
+         tu_cs_begin_sub_stream(cs, cs_size, sub_cs);
+         *fence_iova = patchpoint_cs->fence_bo.iova;
+         return VK_SUCCESS;
+      }
+   }
+
+   struct tu_vis_stream_patchpoint_cs patchpoint_cs;
+   mtx_lock(&cmd->device->vis_stream_suballocator_mtx);
+   VkResult result =
+      tu_suballoc_bo_alloc(&patchpoint_cs.cs_bo,
+                           &cmd->device->vis_stream_suballocator,
+                           cs_size * 4, 4);
+   if (result != VK_SUCCESS) {
+      mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);
+      return result;
+   }
+
+   result =
+      tu_suballoc_bo_alloc(&patchpoint_cs.fence_bo,
+                           &cmd->device->vis_stream_suballocator,
+                           4, 4);
+   if (result != VK_SUCCESS) {
+      tu_suballoc_bo_free(&cmd->device->vis_stream_suballocator,
+                          &patchpoint_cs.cs_bo);
+      mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);
+      return result;
+   }
+   mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);
+
+   util_dynarray_append(&cmd->vis_stream_cs_bos, patchpoint_cs);
+
+   tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs.cs_bo);
+   tu_cs_begin_sub_stream(cs, cs_size, sub_cs);
+   *fence_iova = patchpoint_cs.fence_bo.iova;
+   return VK_SUCCESS;
+}
+
+static VkResult
+resolve_vis_stream_patchpoints(struct tu_queue *queue,
+                               void *submit,
+                               struct util_dynarray *dump_cmds,
+                               struct tu_cmd_buffer **cmd_buffers,
+                               uint32_t cmdbuf_count)
+{
+   struct tu_device *dev = queue->device;
+
+   uint32_t max_size = 0;
+   for (unsigned i = 0; i < cmdbuf_count; i++)
+      max_size = MAX2(max_size, cmd_buffers[i]->vsc_size);
+
+   if (max_size == 0)
+      return VK_SUCCESS;
+
+   struct tu_bo *bo = NULL;
+   VkResult result = VK_SUCCESS;
+   mtx_lock(&dev->vis_stream_mtx);
+   if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) {
+      if (dev->vis_stream_bo)
+         tu_bo_finish(dev, dev->vis_stream_bo);
+      result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo,
+                              max_size, TU_BO_ALLOC_INTERNAL_RESOURCE,
+                              "visibility stream");
+   }
+   bo = dev->vis_stream_bo;
+   mtx_unlock(&dev->vis_stream_mtx);
+
+   if (!bo)
+      return result;
+
+   /* Attach a reference to the BO to each command buffer involved in the
+    * submit.
+    */
+   for (unsigned i = 0; i < cmdbuf_count; i++) {
+      bool has_bo = false;
+      util_dynarray_foreach (&cmd_buffers[i]->vis_stream_bos,
+                             struct tu_bo *, cmd_bo) {
+         if (*cmd_bo == bo) {
+            has_bo = true;
+            break;
+         }
+      }
+
+      if (!has_bo) {
+         util_dynarray_append(&cmd_buffers[i]->vis_stream_bos,
+                              tu_bo_get_ref(bo));
+      }
+   }
+
+   for (unsigned i = 0; i < cmdbuf_count; i++) {
+      struct tu_cs cs, sub_cs;
+      uint64_t fence_iova = 0;
+      if (cmd_buffers[i]->usage_flags &
+          VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
+         result = get_vis_stream_patchpoint_cs(cmd_buffers[i],
+                                               &cs, &sub_cs, &fence_iova);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints,
+                             struct tu_vis_stream_patchpoint,
+                             patchpoint) {
+         uint64_t final_iova = bo->iova + patchpoint->offset;
+
+         if (cmd_buffers[i]->usage_flags &
+             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
+            tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 4);
+            tu_cs_emit_qw(&sub_cs, patchpoint->iova);
+            tu_cs_emit_qw(&sub_cs, final_iova);
+         } else {
+            patchpoint->data[0] = final_iova;
+            patchpoint->data[1] = final_iova >> 32;
+         }
+      }
+
+      if (cmd_buffers[i]->usage_flags &
+          VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
+         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0);
+         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
+
+         /* Signal that this CS is done and can be reused. */
+         tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3);
+         tu_cs_emit_qw(&sub_cs, fence_iova);
+         tu_cs_emit(&sub_cs, 1);
+
+         struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);
+         submit_add_entries(queue->device, submit, dump_cmds, &entry, 1);
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
 static VkResult
 queue_submit_sparse(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
 {
@@ -206,6 +376,11 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
    if (!submit)
       goto fail_create_submit;
 
+   result = resolve_vis_stream_patchpoints(queue, submit, &dump_cmds,
+                                           cmd_buffers, cmdbuf_count);
+   if (result != VK_SUCCESS)
+      goto out;
+
    if (has_trace_points) {
       tu_u_trace_submission_data_create(
          device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);