tu: Rewrite visibility stream allocation
The mechanism implemented in the hardware to synchronize against Write-after-Read hazards with the visibility stream for concurrent binning is for BV and BR to keep track of the number of render passes they have finished, and BV waits until BR_count >= BV_count - vis_stream_count. For example, if there are two visibility streams and the user submits three renderpasses, then before starting renderpass #3 BV will wait for BR to finish renderpass #1. It's assumed that renderpass #3 and #2 use different visibility streams, so it's safe to start working on #3 once #2 is done.

This mechanism is assumed to work across renderpasses and even submits, and the only way to reset the BR/BV counts is via CP_RESET_CONTEXT_STATE, which is only done by the kernel when switching contexts. This vastly complicates things for Vulkan, where we have no idea in what order command buffers will be submitted. This means that we have to defer emitting the actual pointers until submission time and create patchpoints instead. Unfortunately, this gets very complicated with SIMULTANEOUS_USE_BIT, where we have to update the patchpoints on the GPU.

I've taken the liberty of also deferring the allocation of the visibility stream until submit time. This will help us later move to per-queue visibility streams, which will be necessary for supporting multiple simultaneous queues.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36590>
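To make the wait rule in the first paragraph concrete, here is a minimal sketch of the throttle it describes. This is illustrative only, not turnip code: the function and parameter names (bv_may_start_pass, pass_index, br_finished, vis_stream_count) are invented for this example.

/* Model of the concurrent-binning throttle: BV may bin pass N only once
 * BR has finished pass N - vis_stream_count, i.e. once
 * BR_count >= BV_count - vis_stream_count. */
#include <stdbool.h>
#include <stdint.h>

static bool
bv_may_start_pass(uint32_t pass_index,       /* 1-based pass BV wants to bin */
                  uint32_t br_finished,      /* passes BR has fully rendered */
                  uint32_t vis_stream_count) /* number of visibility streams */
{
   /* The first vis_stream_count passes each get a stream slot that has never
    * been used, so there is nothing to wait for. */
   if (pass_index <= vis_stream_count)
      return true;

   /* Otherwise the slot is recycled from pass (pass_index - vis_stream_count)
    * and BV must wait until BR has retired that pass. With two streams,
    * pass #3 waits for BR to finish pass #1. */
   return br_finished >= pass_index - vis_stream_count;
}

In other words, with N streams the hardware lets BV run at most N passes ahead of BR, and the driver cannot reset the counts itself (only the kernel's CP_RESET_CONTEXT_STATE on a context switch can), which is what forces the submit-time patchpoint scheme in the diff below.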
This commit is contained in:
parent 416dc87be9
commit 50aa66a7c1

5 changed files with 358 additions and 23 deletions
@@ -201,21 +201,32 @@ tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd)

   mtx_unlock(&dev->mutex);

   struct tu_bo *vsc_bo;
   uint32_t prim_strm_size = cmd->vsc_prim_strm_pitch * num_vsc_pipes;
   uint32_t draw_strm_size = cmd->vsc_draw_strm_pitch * num_vsc_pipes;
   uint32_t draw_strm_size_size = 4 * num_vsc_pipes;
   uint32_t state_size = 4 * num_vsc_pipes;

   tu_get_scratch_bo(dev,
                     prim_strm_size + draw_strm_size + draw_strm_size_size +
                        state_size,
                     &vsc_bo);
   cmd->vsc_size =
      prim_strm_size + draw_strm_size + draw_strm_size_size + state_size;

   cmd->vsc_prim_strm_va = vsc_bo->iova;
   cmd->vsc_draw_strm_va = vsc_bo->iova + prim_strm_size;
   cmd->vsc_draw_strm_size_va = cmd->vsc_draw_strm_va + draw_strm_size;
   cmd->vsc_state_va = cmd->vsc_draw_strm_size_va + draw_strm_size_size;
   cmd->vsc_prim_strm_offset = 0;
   cmd->vsc_draw_strm_offset = prim_strm_size;
   cmd->vsc_draw_strm_size_offset = cmd->vsc_draw_strm_offset + draw_strm_size;
   cmd->vsc_state_offset = cmd->vsc_draw_strm_size_offset + draw_strm_size_size;
}

static void
tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                              uint32_t offset)
{
   struct tu_vis_stream_patchpoint patchpoint = {
      .data = cs->cur,
      .iova = tu_cs_get_cur_iova(cs),
      .offset = offset,
   };

   util_dynarray_append(&cmd->vis_stream_patchpoints, patchpoint);
   tu_cs_emit_qw(cs, offset);
}

template <chip CHIP>
@@ -223,20 +234,20 @@ static void
tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs,
                      A6XX_VSC_SIZE_BASE(.qword = cmd->vsc_draw_strm_size_va));
      tu_cs_emit_regs(cs,
                      A6XX_VSC_PIPE_DATA_PRIM_BASE(.qword = cmd->vsc_prim_strm_va));
      tu_cs_emit_regs(
         cs, A6XX_VSC_PIPE_DATA_DRAW_BASE(.qword = cmd->vsc_draw_strm_va));
      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_SIZE_BASE, 2);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_PRIM_BASE, 2);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
      tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_DRAW_BASE, 2);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
   } else {
      tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3);
      tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_DRAW_BASE));
      tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset);
      tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_SIZE_BASE));
      tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset);
      tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_PRIM_BASE));
      tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va);
      tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset);
   }

   cmd->vsc_initialized = true;
@@ -1278,7 +1289,13 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                  A6XX_CP_SET_MARKER_0_USES_GMEM);

   if (CHIP == A6XX && cmd->device->physical_device->has_preemption) {
      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
         tu_cs_set_writeable(cs, true);

      tu_emit_vsc<CHIP>(cmd, &cmd->cs);

      if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
         tu_cs_set_writeable(cs, false);
   }

   unsigned views = tu_fdm_num_layers(cmd);
@@ -2798,8 +2815,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
    * emits the preamble lazily. We chose the per-bin approach but blob's
    * should be a better one.
    */
   if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
      tu_cs_set_writeable(cs, true);

   tu_emit_vsc<CHIP>(cmd, cs);

   if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
      tu_cs_set_writeable(cs, false);

   tu6_emit_bin_size<CHIP>(cs, tiling->tile0.width, tiling->tile0.height,
                           {
                              .render_mode = BINNING_PASS,
@@ -2855,13 +2878,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      tu6_lazy_init_vsc(cmd);

      /* Upload state regs to memory to be restored on skipsaverestore
       * preemption.
       * preemption. On a7xx this is considered part of the vis stream that
       * requires a patchpoint.
       */
      if (CHIP >= A7XX &&
          (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
         tu_cs_set_writeable(cs, true);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) |
                     CP_REG_TO_MEM_0_CNT(32));
      if (CHIP >= A7XX)
         tu_cs_emit_qw(cs, cmd->vsc_state_va);
         tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
      else
         tu_cs_emit_qw(cs, global_iova(cmd, vsc_state));
@@ -2874,8 +2902,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
         tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4);
         tu_cs_emit(cs, num_vsc_pipes); /* count */
         tu_cs_emit(cs, 0); /* offset */
         tu_cs_emit_qw(cs, cmd->vsc_state_va);
         tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset);
      }

      if (CHIP >= A7XX &&
          (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
         tu_cs_set_writeable(cs, false);
   }

   tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
@@ -3573,6 +3605,26 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
   ralloc_free(cmd_buffer->pre_chain.patchpoints_ctx);
   util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints);
   util_dynarray_fini(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
   util_dynarray_fini(&cmd_buffer->vis_stream_patchpoints);

   util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
                          bo) {
      tu_bo_finish(cmd_buffer->device, *bo);
   }

   mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
   util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
                          struct tu_vis_stream_patchpoint_cs,
                          bo) {
      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
                          &bo->cs_bo);
      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
                          &bo->fence_bo);
   }
   mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);

   util_dynarray_fini(&cmd_buffer->vis_stream_bos);
   util_dynarray_fini(&cmd_buffer->vis_stream_cs_bos);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
@@ -3649,6 +3701,26 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
   cmd_buffer->pre_chain.patchpoints_ctx = NULL;
   util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
   util_dynarray_clear(&cmd_buffer->pre_chain.fdm_bin_patchpoints);
   util_dynarray_clear(&cmd_buffer->vis_stream_patchpoints);

   util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *,
                          bo) {
      tu_bo_finish(cmd_buffer->device, *bo);
   }

   mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx);
   util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos,
                          struct tu_vis_stream_patchpoint_cs,
                          bo) {
      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
                          &bo->cs_bo);
      tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator,
                          &bo->fence_bo);
   }
   mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx);

   util_dynarray_clear(&cmd_buffer->vis_stream_bos);
   util_dynarray_clear(&cmd_buffer->vis_stream_cs_bos);
}

const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
@@ -5562,6 +5634,58 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
         util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
                                       &secondary->fdm_bin_patchpoints);
      } else {
         struct tu_cs *cs = &cmd->cs;

         /* If the secondary can be used multiple times, we have to set its
          * patchpoints on the GPU. Set them here, and create a new
          * patchpoint pointing to the CP_MEM_WRITE packet. Otherwise just
          * copy them over adjusting the index.
          */
         bool simultaneous_use = secondary->usage_flags &
            VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

         /* If this cmdbuf itself can be used multiple times in a submit then
          * its patchpoint will also be updated on the GPU.
          */
         if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
            tu_cs_set_writeable(cs, true);

         util_dynarray_foreach (&secondary->vis_stream_patchpoints,
                                struct tu_vis_stream_patchpoint,
                                secondary_patchpoint) {
            struct tu_vis_stream_patchpoint patchpoint =
               *secondary_patchpoint;

            if (simultaneous_use) {
               tu_cs_reserve_space(cs, 5);
               tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
               tu_cs_emit_qw(cs, patchpoint.iova);
               patchpoint.iova = tu_cs_get_cur_iova(cs);
               patchpoint.data = cs->cur;
               tu_cs_emit_qw(cs, 0);
            }

            util_dynarray_append(&cmd->vis_stream_patchpoints,
                                 patchpoint);
         }

         if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
            tu_cs_set_writeable(cs, false);

         if (simultaneous_use) {
            tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
            tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

            /* Make BV wait for updates on BR to land */
            if (cmd->device->physical_device->info->chip >= 7) {
               tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
               tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) |
                              CP_THREAD_CONTROL_0_SYNC_THREADS);
            }
         }

         cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size);

         switch (secondary->state.suspend_resume) {
         case SR_NONE:
            assert(tu_cs_is_empty(&secondary->draw_cs));
@@ -618,6 +618,10 @@ struct tu_cmd_buffer
   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   struct util_dynarray vis_stream_patchpoints;
   struct util_dynarray vis_stream_bos;
   struct util_dynarray vis_stream_cs_bos;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

@@ -686,8 +690,9 @@ struct tu_cmd_buffer

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va;
   uint64_t vsc_state_va;
   uint32_t vsc_draw_strm_offset, vsc_draw_strm_size_offset;
   uint32_t vsc_prim_strm_offset, vsc_state_offset;
   uint64_t vsc_size;
   bool vsc_initialized;

   bool prev_fsr_is_null;

@@ -833,6 +838,16 @@ struct tu_fdm_bin_patchpoint {
   tu_fdm_bin_apply_t apply;
};

struct tu_vis_stream_patchpoint {
   uint32_t *data;
   uint64_t iova;
   uint32_t offset;
};

struct tu_vis_stream_patchpoint_cs {
   struct tu_suballoc_bo cs_bo;
   struct tu_suballoc_bo fence_bo;
};

void
tu_barrier(struct tu_cmd_buffer *cmd,
@@ -2725,6 +2725,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   mtx_init(&device->radix_sort_mutex, mtx_plain);
   mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain);
   mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain);
   mtx_init(&device->vis_stream_mtx, mtx_plain);
   mtx_init(&device->vis_stream_suballocator_mtx, mtx_plain);
   mtx_init(&device->mutex, mtx_plain);
   mtx_init(&device->copy_timestamp_cs_pool_mutex, mtx_plain);
#ifdef HAVE_PERFETTO

@@ -2853,6 +2855,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                           getpagesize(), TU_BO_ALLOC_INTERNAL_RESOURCE,
                           "event_suballoc");

   tu_bo_suballocator_init(
      &device->vis_stream_suballocator, device,
      getpagesize(),
      (enum tu_bo_alloc_flags)(TU_BO_ALLOC_INTERNAL_RESOURCE |
                               TU_BO_ALLOC_ALLOW_DUMP),
      "vis_stream_suballoc");

   result = tu_bo_init_new(
      device, NULL, &device->global_bo, global_size,
      (enum tu_bo_alloc_flags) (TU_BO_ALLOC_ALLOW_DUMP |

@@ -3146,12 +3155,16 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
   tu_bo_suballocator_finish(&device->autotune_suballoc);
   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
   tu_bo_suballocator_finish(&device->event_suballoc);
   tu_bo_suballocator_finish(&device->vis_stream_suballocator);

   tu_bo_finish(device, device->global_bo);

   if (device->vm_bind_fence_fd != -1)
      close(device->vm_bind_fence_fd);

   if (device->vis_stream_bo)
      tu_bo_finish(device, device->vis_stream_bo);

   if (device->null_accel_struct_bo)
      tu_bo_finish(device, device->null_accel_struct_bo);
@@ -363,6 +363,11 @@ struct tu_device
   struct tu_suballocator *trace_suballoc;
   mtx_t trace_mutex;

   /* VSC patchpoint BO suballocator.
    */
   struct tu_suballocator vis_stream_suballocator;
   mtx_t vis_stream_suballocator_mtx;

/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)

@@ -433,6 +438,9 @@ struct tu_device

   struct tu_cs_entry bin_preamble_entry, bin_preamble_bv_entry;

   struct tu_bo *vis_stream_bo;
   mtx_t vis_stream_mtx;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;
@@ -85,6 +85,176 @@ submit_add_entries(struct tu_device *dev, void *submit,
   }
}

/* Normally, we can just resolve visibility stream patchpoints on the CPU by
 * writing directly to the command stream with the final iova of the allocated
 * BO. However this doesn't work with SIMULTANEOUS_USE command buffers, where
 * the same buffer may be in flight more than once, including within a submit.
 * To handle this we have to update the patchpoints on the GPU. The lifetime
 * of the CS used to write the patchpoints on the GPU is tricky, since if we
 * always allocate a new one for each submit the size could grow infinitely if
 * the command buffer is never freed or reset. Instead this implements a pool
 * of patchpoint CS's per command buffer that reuses finished CS's.
 */
static VkResult
get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_cs *sub_cs,
                             uint64_t *fence_iova)
{
   /* See below for the commands emitted to the CS. */
   uint32_t cs_size = 5 *
      util_dynarray_num_elements(&cmd->vis_stream_patchpoints,
                                 struct tu_vis_stream_patchpoint) + 6;

   util_dynarray_foreach (&cmd->vis_stream_cs_bos,
                          struct tu_vis_stream_patchpoint_cs,
                          patchpoint_cs) {
      uint32_t *fence = (uint32_t *)patchpoint_cs->fence_bo.bo->map;
      if (*fence == 1) {
         *fence = 0;
         tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs->cs_bo);
         tu_cs_begin_sub_stream(cs, cs_size, sub_cs);
         *fence_iova = patchpoint_cs->fence_bo.iova;
         return VK_SUCCESS;
      }
   }

   struct tu_vis_stream_patchpoint_cs patchpoint_cs;

   mtx_lock(&cmd->device->vis_stream_suballocator_mtx);
   VkResult result =
      tu_suballoc_bo_alloc(&patchpoint_cs.cs_bo,
                           &cmd->device->vis_stream_suballocator,
                           cs_size * 4, 4);

   if (result != VK_SUCCESS) {
      mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);
      return result;
   }

   result =
      tu_suballoc_bo_alloc(&patchpoint_cs.fence_bo,
                           &cmd->device->vis_stream_suballocator,
                           4, 4);

   if (result != VK_SUCCESS) {
      tu_suballoc_bo_free(&cmd->device->vis_stream_suballocator,
                          &patchpoint_cs.cs_bo);
      mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);
      return result;
   }

   mtx_unlock(&cmd->device->vis_stream_suballocator_mtx);

   util_dynarray_append(&cmd->vis_stream_cs_bos, patchpoint_cs);

   tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs.cs_bo);
   tu_cs_begin_sub_stream(cs, cs_size, sub_cs);
   *fence_iova = patchpoint_cs.fence_bo.iova;

   return VK_SUCCESS;
}

static VkResult
resolve_vis_stream_patchpoints(struct tu_queue *queue,
                               void *submit,
                               struct util_dynarray *dump_cmds,
                               struct tu_cmd_buffer **cmd_buffers,
                               uint32_t cmdbuf_count)
{
   struct tu_device *dev = queue->device;

   uint32_t max_size = 0;
   for (unsigned i = 0; i < cmdbuf_count; i++)
      max_size = MAX2(max_size, cmd_buffers[i]->vsc_size);

   if (max_size == 0)
      return VK_SUCCESS;

   struct tu_bo *bo = NULL;
   VkResult result = VK_SUCCESS;

   mtx_lock(&dev->vis_stream_mtx);

   if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) {
      if (dev->vis_stream_bo)
         tu_bo_finish(dev, dev->vis_stream_bo);
      result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo,
                              max_size, TU_BO_ALLOC_INTERNAL_RESOURCE,
                              "visibility stream");
   }

   bo = dev->vis_stream_bo;

   mtx_unlock(&dev->vis_stream_mtx);

   if (!bo)
      return result;

   /* Attach a reference to the BO to each command buffer involved in the
    * submit.
    */
   for (unsigned i = 0; i < cmdbuf_count; i++) {
      bool has_bo = false;
      util_dynarray_foreach (&cmd_buffers[i]->vis_stream_bos,
                             struct tu_bo *, cmd_bo) {
         if (*cmd_bo == bo) {
            has_bo = true;
            break;
         }
      }

      if (!has_bo) {
         util_dynarray_append(&cmd_buffers[i]->vis_stream_bos,
                              tu_bo_get_ref(bo));
      }
   }

   for (unsigned i = 0; i < cmdbuf_count; i++) {
      struct tu_cs cs, sub_cs;
      uint64_t fence_iova = 0;
      if (cmd_buffers[i]->usage_flags &
          VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
         result = get_vis_stream_patchpoint_cs(cmd_buffers[i],
                                               &cs, &sub_cs, &fence_iova);
         if (result != VK_SUCCESS)
            return result;
      }

      util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints,
                             struct tu_vis_stream_patchpoint,
                             patchpoint) {
         uint64_t final_iova = bo->iova + patchpoint->offset;

         if (cmd_buffers[i]->usage_flags &
             VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
            tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 4);
            tu_cs_emit_qw(&sub_cs, patchpoint->iova);
            tu_cs_emit_qw(&sub_cs, final_iova);
         } else {
            patchpoint->data[0] = final_iova;
            patchpoint->data[1] = final_iova >> 32;
         }
      }

      if (cmd_buffers[i]->usage_flags &
          VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) {
         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0);
         tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);

         /* Signal that this CS is done and can be reused. */
         tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3);
         tu_cs_emit_qw(&sub_cs, fence_iova);
         tu_cs_emit(&sub_cs, 1);

         struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);
         submit_add_entries(queue->device, submit, dump_cmds, &entry, 1);
      }
   }

   return VK_SUCCESS;
}

static VkResult
queue_submit_sparse(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
{
@@ -206,6 +376,11 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
   if (!submit)
      goto fail_create_submit;

   result = resolve_vis_stream_patchpoints(queue, submit, &dump_cmds,
                                           cmd_buffers, cmdbuf_count);
   if (result != VK_SUCCESS)
      goto out;

   if (has_trace_points) {
      tu_u_trace_submission_data_create(
         device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);