diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index f3bc3c95210..f8d05f73238 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -201,21 +201,32 @@ tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd) mtx_unlock(&dev->mutex); - struct tu_bo *vsc_bo; uint32_t prim_strm_size = cmd->vsc_prim_strm_pitch * num_vsc_pipes; uint32_t draw_strm_size = cmd->vsc_draw_strm_pitch * num_vsc_pipes; uint32_t draw_strm_size_size = 4 * num_vsc_pipes; uint32_t state_size = 4 * num_vsc_pipes; - tu_get_scratch_bo(dev, - prim_strm_size + draw_strm_size + draw_strm_size_size + - state_size, - &vsc_bo); + cmd->vsc_size = + prim_strm_size + draw_strm_size + draw_strm_size_size + state_size; /* bytes this cmdbuf needs in the visibility stream BO */ - cmd->vsc_prim_strm_va = vsc_bo->iova; - cmd->vsc_draw_strm_va = vsc_bo->iova + prim_strm_size; - cmd->vsc_draw_strm_size_va = cmd->vsc_draw_strm_va + draw_strm_size; - cmd->vsc_state_va = cmd->vsc_draw_strm_size_va + draw_strm_size_size; + cmd->vsc_prim_strm_offset = 0; /* regions are laid out back to back; these are byte offsets, not addresses */ + cmd->vsc_draw_strm_offset = prim_strm_size; + cmd->vsc_draw_strm_size_offset = cmd->vsc_draw_strm_offset + draw_strm_size; + cmd->vsc_state_offset = cmd->vsc_draw_strm_size_offset + draw_strm_size_size; +} +/* Emit offset as a 64-bit placeholder and record its location so it can be patched later. */ +static void +tu_emit_vis_stream_patchpoint(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + uint32_t offset) +{ + struct tu_vis_stream_patchpoint patchpoint = { + .data = cs->cur, /* CPU pointer to the placeholder */ + .iova = tu_cs_get_cur_iova(cs), /* GPU address of the placeholder */ + .offset = offset, + }; + + util_dynarray_append(&cmd->vis_stream_patchpoints, patchpoint); + tu_cs_emit_qw(cs, offset); /* replaced by bo->iova + offset in resolve_vis_stream_patchpoints() */ } template @@ -223,20 +234,20 @@ static void tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { if (CHIP == A6XX) { - tu_cs_emit_regs(cs, - A6XX_VSC_SIZE_BASE(.qword = cmd->vsc_draw_strm_size_va)); - tu_cs_emit_regs(cs, - A6XX_VSC_PIPE_DATA_PRIM_BASE(.qword = cmd->vsc_prim_strm_va)); - tu_cs_emit_regs( - cs, A6XX_VSC_PIPE_DATA_DRAW_BASE(.qword = cmd->vsc_draw_strm_va)); + tu_cs_emit_pkt4(cs, 
REG_A6XX_VSC_SIZE_BASE, 2); /* 2 dwords: filled by the 64-bit placeholder emitted next */ + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset); + tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_PRIM_BASE, 2); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset); + tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_DATA_DRAW_BASE, 2); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset); } else { tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_DRAW_BASE)); - tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_offset); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_SIZE_BASE)); - tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_draw_strm_size_offset); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(VSC_PIPE_DATA_PRIM_BASE)); - tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_prim_strm_offset); } cmd->vsc_initialized = true; @@ -1278,7 +1289,13 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, A6XX_CP_SET_MARKER_0_USES_GMEM); if (CHIP == A6XX && cmd->device->physical_device->has_preemption) { + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, true); /* the GPU rewrites the placeholders of SIMULTANEOUS_USE cmdbufs. NOTE(review): tu_emit_vsc() below targets &cmd->cs, confirm cs is the same CS here */ + tu_emit_vsc(cmd, &cmd->cs); + + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, false); } unsigned views = tu_fdm_num_layers(cmd); @@ -2798,8 +2815,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, * emits the preamble lazily. We chose the per-bin approach but blob's * should be a better one. 
*/ + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, true); /* the placeholders emitted by tu_emit_vsc() get rewritten by the GPU */ + tu_emit_vsc(cmd, cs); + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, false); + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, { .render_mode = BINNING_PASS, @@ -2855,13 +2878,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu6_lazy_init_vsc(cmd); /* Upload state regs to memory to be restored on skipsaverestore - * preemption. + * preemption. On a7xx this is considered part of the vis stream that + * requires a patchpoint. */ + if (CHIP >= A7XX && + (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) + tu_cs_set_writeable(cs, true); + tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) | CP_REG_TO_MEM_0_CNT(32)); if (CHIP >= A7XX) - tu_cs_emit_qw(cs, cmd->vsc_state_va); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset); /* destination address placeholder */ else tu_cs_emit_qw(cs, global_iova(cmd, vsc_state)); @@ -2874,8 +2902,12 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_MEM_TO_SCRATCH_MEM, 4); tu_cs_emit(cs, num_vsc_pipes); /* count */ tu_cs_emit(cs, 0); /* offset */ - tu_cs_emit_qw(cs, cmd->vsc_state_va); + tu_emit_vis_stream_patchpoint(cmd, cs, cmd->vsc_state_offset); /* source address placeholder */ } + + if (CHIP >= A7XX && + (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) + tu_cs_set_writeable(cs, false); } tu_autotune_begin_renderpass(cmd, cs, autotune_result); @@ -3573,6 +3605,26 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) ralloc_free(cmd_buffer->pre_chain.patchpoints_ctx); util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints); util_dynarray_fini(&cmd_buffer->pre_chain.fdm_bin_patchpoints); + util_dynarray_fini(&cmd_buffer->vis_stream_patchpoints); + + util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *, + bo) { + 
tu_bo_finish(cmd_buffer->device, *bo); /* drop the reference taken in resolve_vis_stream_patchpoints() */ + } + + mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx); /* the suballocator belongs to the device, not to this cmdbuf */ + util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos, + struct tu_vis_stream_patchpoint_cs, + bo) { + tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator, + &bo->cs_bo); + tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator, + &bo->fence_bo); + } + mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx); + + util_dynarray_fini(&cmd_buffer->vis_stream_bos); + util_dynarray_fini(&cmd_buffer->vis_stream_cs_bos); vk_command_buffer_finish(&cmd_buffer->vk); vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc, @@ -3649,6 +3701,26 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, cmd_buffer->pre_chain.patchpoints_ctx = NULL; util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints); util_dynarray_clear(&cmd_buffer->pre_chain.fdm_bin_patchpoints); + util_dynarray_clear(&cmd_buffer->vis_stream_patchpoints); + + util_dynarray_foreach (&cmd_buffer->vis_stream_bos, struct tu_bo *, + bo) { + tu_bo_finish(cmd_buffer->device, *bo); /* drop the reference taken in resolve_vis_stream_patchpoints() */ + } + + mtx_lock(&cmd_buffer->device->vis_stream_suballocator_mtx); /* the suballocator belongs to the device, not to this cmdbuf */ + util_dynarray_foreach (&cmd_buffer->vis_stream_cs_bos, + struct tu_vis_stream_patchpoint_cs, + bo) { + tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator, + &bo->cs_bo); + tu_suballoc_bo_free(&cmd_buffer->device->vis_stream_suballocator, + &bo->fence_bo); + } + mtx_unlock(&cmd_buffer->device->vis_stream_suballocator_mtx); + + util_dynarray_clear(&cmd_buffer->vis_stream_bos); + util_dynarray_clear(&cmd_buffer->vis_stream_cs_bos); } const struct vk_command_buffer_ops tu_cmd_buffer_ops = { @@ -5562,6 +5634,58 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints, &secondary->fdm_bin_patchpoints); } else { + struct tu_cs *cs = &cmd->cs; + + /* If the secondary can be used multiple times, we have to set its + * patchpoints on the GPU. 
Set them here, and create a new + * patchpoint pointing to the CP_MEM_WRITE packet. Otherwise just + * copy them over unchanged. + */ + bool simultaneous_use = secondary->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + + /* If this cmdbuf itself can be used multiple times in a submit then + * its patchpoint will also be updated on the GPU. + */ + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, true); + + util_dynarray_foreach (&secondary->vis_stream_patchpoints, + struct tu_vis_stream_patchpoint, + secondary_patchpoint) { + struct tu_vis_stream_patchpoint patchpoint = + *secondary_patchpoint; + + if (simultaneous_use) { + tu_cs_reserve_space(cs, 5); /* 1 header dword + the 4 payload dwords emitted below */ + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); + tu_cs_emit_qw(cs, patchpoint.iova); /* destination: the placeholder inside the secondary */ + patchpoint.iova = tu_cs_get_cur_iova(cs); /* from here on the patchpoint refers to the value dwords of this packet */ + patchpoint.data = cs->cur; + tu_cs_emit_qw(cs, 0); /* value placeholder */ + } + + util_dynarray_append(&cmd->vis_stream_patchpoints, + patchpoint); + } + + if (cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + tu_cs_set_writeable(cs, false); + + if (simultaneous_use) { + tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + + /* Make BV wait for updates on BR to land */ + if (cmd->device->physical_device->info->chip >= 7) { + tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); + tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | + CP_THREAD_CONTROL_0_SYNC_THREADS); + } + } + + cmd->vsc_size = MAX2(cmd->vsc_size, secondary->vsc_size); /* the BO sized at submit must also fit the secondary */ + switch (secondary->state.suspend_resume) { case SR_NONE: assert(tu_cs_is_empty(&secondary->draw_cs)); diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index ca06c9180ec..22d1b85dc63 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -618,6 +618,10 @@ struct tu_cmd_buffer void *patchpoints_ctx; struct util_dynarray fdm_bin_patchpoints; + struct util_dynarray vis_stream_patchpoints; /* struct tu_vis_stream_patchpoint */ + 
struct util_dynarray vis_stream_bos; /* struct tu_bo *, one reference per entry */ + struct util_dynarray vis_stream_cs_bos; /* struct tu_vis_stream_patchpoint_cs */ + VkCommandBufferUsageFlags usage_flags; VkQueryPipelineStatisticFlags inherited_pipeline_statistics; @@ -686,8 +690,9 @@ struct tu_cmd_buffer uint32_t vsc_draw_strm_pitch; uint32_t vsc_prim_strm_pitch; - uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va; - uint64_t vsc_state_va; + uint32_t vsc_draw_strm_offset, vsc_draw_strm_size_offset; /* byte offsets into the visibility stream BO */ + uint32_t vsc_prim_strm_offset, vsc_state_offset; + uint64_t vsc_size; /* visibility stream bytes required by this cmdbuf */ bool vsc_initialized; bool prev_fsr_is_null; @@ -833,6 +838,16 @@ struct tu_fdm_bin_patchpoint { tu_fdm_bin_apply_t apply; }; +struct tu_vis_stream_patchpoint { + uint32_t *data; /* CPU pointer to the 64-bit placeholder */ + uint64_t iova; /* GPU address of the same placeholder */ + uint32_t offset; /* byte offset into the visibility stream BO */ +}; + +struct tu_vis_stream_patchpoint_cs { + struct tu_suballoc_bo cs_bo; /* holds the CP_MEM_WRITE commands that patch on the GPU */ + struct tu_suballoc_bo fence_bo; /* dword the GPU sets to 1 once cs_bo has executed */ +}; void tu_barrier(struct tu_cmd_buffer *cmd, diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index c8afe51b49d..c99cc535cf2 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2725,6 +2725,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->radix_sort_mutex, mtx_plain); mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain); mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain); + mtx_init(&device->vis_stream_mtx, mtx_plain); /* guards device->vis_stream_bo */ + mtx_init(&device->vis_stream_suballocator_mtx, mtx_plain); /* guards device->vis_stream_suballocator */ mtx_init(&device->mutex, mtx_plain); mtx_init(&device->copy_timestamp_cs_pool_mutex, mtx_plain); #ifdef HAVE_PERFETTO @@ -2853,6 +2855,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, getpagesize(), TU_BO_ALLOC_INTERNAL_RESOURCE, "event_suballoc"); + tu_bo_suballocator_init( + &device->vis_stream_suballocator, device, + getpagesize(), + (enum tu_bo_alloc_flags)(TU_BO_ALLOC_INTERNAL_RESOURCE | + TU_BO_ALLOC_ALLOW_DUMP), + "vis_stream_suballoc"); + result = tu_bo_init_new( device, NULL, &device->global_bo, global_size, (enum tu_bo_alloc_flags) (TU_BO_ALLOC_ALLOW_DUMP | @@ 
-3146,12 +3155,16 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) tu_bo_suballocator_finish(&device->autotune_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); + tu_bo_suballocator_finish(&device->vis_stream_suballocator); tu_bo_finish(device, device->global_bo); if (device->vm_bind_fence_fd != -1) close(device->vm_bind_fence_fd); + if (device->vis_stream_bo) + tu_bo_finish(device, device->vis_stream_bo); + if (device->null_accel_struct_bo) tu_bo_finish(device, device->null_accel_struct_bo); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index b8591e773ac..6d555a0be2a 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -363,6 +363,11 @@ struct tu_device struct tu_suballocator *trace_suballoc; mtx_t trace_mutex; + /* VSC patchpoint BO suballocator. + */ + struct tu_suballocator vis_stream_suballocator; + mtx_t vis_stream_suballocator_mtx; + /* the blob seems to always use 8K factor and 128K param sizes, copy them */ #define TU_TESS_FACTOR_SIZE (8 * 1024) #define TU_TESS_PARAM_SIZE (128 * 1024) @@ -433,6 +438,9 @@ struct tu_device struct tu_cs_entry bin_preamble_entry, bin_preamble_bv_entry; + struct tu_bo *vis_stream_bo; /* shared visibility stream BO, sized at submit time; guarded by vis_stream_mtx */ + mtx_t vis_stream_mtx; + struct util_dynarray dynamic_rendering_pending; VkCommandPool dynamic_rendering_pool; uint32_t dynamic_rendering_fence; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index 65a510f61a8..7394e18c39c 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -85,6 +85,187 @@ submit_add_entries(struct tu_device *dev, void *submit, } } +/* Normally, we can just resolve visibility stream patchpoints on the CPU by + * writing directly to the command stream with the final iova of the allocated + * BO. 
However this doesn't work with SIMULTANEOUS_USE command buffers, where + * the same buffer may be in flight more than once, including within a submit. + * To handle this we have to update the patchpoints on the GPU. The lifetime + * of the CS used to write the patchpoints on the GPU is tricky, since if we + * always allocate a new one for each submit the size could grow infinitely if + * the command buffer is never freed or reset. Instead this implements a pool + * of patchpoint CS's per command buffer that reuses finished CS's. + * + * On success, cs wraps the selected CS BO, sub_cs is an open sub-stream of it + * to emit into, and *fence_iova is the address of the dword the GPU must set + * to 1 once the CS has executed, which marks the CS as reusable. + */ +static VkResult +get_vis_stream_patchpoint_cs(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_cs *sub_cs, + uint64_t *fence_iova) +{ + /* 5 dwords per patchpoint plus 6 for the trailing packets; see the commands emitted in resolve_vis_stream_patchpoints(). */ + uint32_t cs_size = 5 * + util_dynarray_num_elements(&cmd->vis_stream_patchpoints, + struct tu_vis_stream_patchpoint) + 6; + + util_dynarray_foreach (&cmd->vis_stream_cs_bos, + struct tu_vis_stream_patchpoint_cs, + patchpoint_cs) { + uint32_t *fence = (uint32_t *)patchpoint_cs->fence_bo.bo->map; /* NOTE(review): this is the mapping of the whole backing BO; confirm the offset of the suballocation does not need to be added */ + if (*fence == 1) { + *fence = 0; /* claim the CS until the GPU signals it again */ + tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs->cs_bo); + tu_cs_begin_sub_stream(cs, cs_size, sub_cs); + *fence_iova = patchpoint_cs->fence_bo.iova; + return VK_SUCCESS; + } + } + + struct tu_vis_stream_patchpoint_cs patchpoint_cs; /* no finished CS available: grow the pool */ + + mtx_lock(&cmd->device->vis_stream_suballocator_mtx); + VkResult result = + tu_suballoc_bo_alloc(&patchpoint_cs.cs_bo, + &cmd->device->vis_stream_suballocator, + cs_size * 4, 4); + + if (result != VK_SUCCESS) { + mtx_unlock(&cmd->device->vis_stream_suballocator_mtx); + return result; + } + + result = + tu_suballoc_bo_alloc(&patchpoint_cs.fence_bo, + &cmd->device->vis_stream_suballocator, + 4, 4); + + if (result != VK_SUCCESS) { + tu_suballoc_bo_free(&cmd->device->vis_stream_suballocator, + &patchpoint_cs.cs_bo); + mtx_unlock(&cmd->device->vis_stream_suballocator_mtx); + return result; + } + + mtx_unlock(&cmd->device->vis_stream_suballocator_mtx); + + 
util_dynarray_append(&cmd->vis_stream_cs_bos, patchpoint_cs); /* NOTE(review): the fence dword is not cleared here; confirm freshly suballocated memory cannot already read as 1 */ + + tu_cs_init_suballoc(cs, cmd->device, &patchpoint_cs.cs_bo); + tu_cs_begin_sub_stream(cs, cs_size, sub_cs); + *fence_iova = patchpoint_cs.fence_bo.iova; + + return VK_SUCCESS; +} + +/* Make sure the device-wide visibility stream BO is large enough for every + * command buffer in the submit, then resolve each recorded patchpoint to + * bo->iova + offset: directly on the CPU, or via a patchpoint CS added to the + * submit for SIMULTANEOUS_USE command buffers. + */ +static VkResult +resolve_vis_stream_patchpoints(struct tu_queue *queue, + void *submit, + struct util_dynarray *dump_cmds, + struct tu_cmd_buffer **cmd_buffers, + uint32_t cmdbuf_count) +{ + struct tu_device *dev = queue->device; + + uint64_t max_size = 0; /* same width as tu_cmd_buffer::vsc_size, so nothing is truncated */ + for (unsigned i = 0; i < cmdbuf_count; i++) + max_size = MAX2(max_size, cmd_buffers[i]->vsc_size); + + if (max_size == 0) + return VK_SUCCESS; + + struct tu_bo *bo = NULL; + VkResult result = VK_SUCCESS; + + mtx_lock(&dev->vis_stream_mtx); + + if (!dev->vis_stream_bo || max_size > dev->vis_stream_bo->size) { + if (dev->vis_stream_bo) { + tu_bo_finish(dev, dev->vis_stream_bo); + dev->vis_stream_bo = NULL; /* never leave a pointer to the released BO behind if the allocation below fails */ + } + result = tu_bo_init_new(dev, &dev->vk.base, &dev->vis_stream_bo, + max_size, TU_BO_ALLOC_INTERNAL_RESOURCE, + "visibility stream"); + } + + bo = dev->vis_stream_bo; + + mtx_unlock(&dev->vis_stream_mtx); + + if (result != VK_SUCCESS || !bo) + return result; + + /* Attach a reference to the BO to each command buffer involved in the + * submit. 
+ */ + for (unsigned i = 0; i < cmdbuf_count; i++) { + bool has_bo = false; + util_dynarray_foreach (&cmd_buffers[i]->vis_stream_bos, + struct tu_bo *, cmd_bo) { + if (*cmd_bo == bo) { + has_bo = true; + break; + } + } + + if (!has_bo) { + util_dynarray_append(&cmd_buffers[i]->vis_stream_bos, + tu_bo_get_ref(bo)); + } + } + + for (unsigned i = 0; i < cmdbuf_count; i++) { + struct tu_cs cs, sub_cs; /* only initialized for SIMULTANEOUS_USE cmdbufs. NOTE(review): cs is never passed to a finish call; confirm tu_cs_init_suballoc() acquires nothing that must be released */ + uint64_t fence_iova = 0; + if (cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + result = get_vis_stream_patchpoint_cs(cmd_buffers[i], + &cs, &sub_cs, &fence_iova); + if (result != VK_SUCCESS) + return result; + } + + util_dynarray_foreach (&cmd_buffers[i]->vis_stream_patchpoints, + struct tu_vis_stream_patchpoint, + patchpoint) { + uint64_t final_iova = bo->iova + patchpoint->offset; + + if (cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 4); /* let the GPU write the address into the placeholder */ + tu_cs_emit_qw(&sub_cs, patchpoint->iova); + tu_cs_emit_qw(&sub_cs, final_iova); + } else { + patchpoint->data[0] = final_iova; /* low dword; the cmdbuf is not in flight elsewhere, so patch from the CPU */ + patchpoint->data[1] = final_iova >> 32; + } + } + + if (cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + tu_cs_emit_pkt7(&sub_cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0); + + /* Signal that this CS is done and can be reused. 
+ */ + tu_cs_emit_pkt7(&sub_cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(&sub_cs, fence_iova); + tu_cs_emit(&sub_cs, 1); + + struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs); + submit_add_entries(queue->device, submit, dump_cmds, &entry, 1); + } + } + + return VK_SUCCESS; +} + static VkResult queue_submit_sparse(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) { @@ -206,6 +387,11 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) if (!submit) goto fail_create_submit; + result = resolve_vis_stream_patchpoints(queue, submit, &dump_cmds, + cmd_buffers, cmdbuf_count); + if (result != VK_SUCCESS) + goto out; + if (has_trace_points) { tu_u_trace_submission_data_create( device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);