diff --git a/src/freedreno/vulkan/tu_knl.cc b/src/freedreno/vulkan/tu_knl.cc index 9633d955d05..46a17c4e880 100644 --- a/src/freedreno/vulkan/tu_knl.cc +++ b/src/freedreno/vulkan/tu_knl.cc @@ -281,11 +281,37 @@ tu_drm_submitqueue_close(struct tu_device *dev, uint32_t queue_id) dev->instance->knl->submitqueue_close(dev, queue_id); } -VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +void * +tu_submit_create(struct tu_device *dev) { - struct tu_queue *queue = container_of(vk_queue, struct tu_queue, vk); - return queue->device->instance->knl->queue_submit(queue, submit); + return dev->instance->knl->submit_create(dev); +} + +void +tu_submit_finish(struct tu_device *dev, void *submit) +{ + return dev->instance->knl->submit_finish(dev, submit); +} + +void +tu_submit_add_entries(struct tu_device *dev, void *submit, + struct tu_cs_entry *entries, + unsigned num_entries) +{ + return dev->instance->knl->submit_add_entries(dev, submit, entries, + num_entries); +} + +VkResult +tu_queue_submit(struct tu_queue *queue, void *submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data *u_trace_submission_data) +{ + return queue->device->instance->knl->queue_submit(queue, submit, + waits, wait_count, + signals, signal_count, + u_trace_submission_data); } /** diff --git a/src/freedreno/vulkan/tu_knl.h b/src/freedreno/vulkan/tu_knl.h index 305686439c1..a0c42a9d0c0 100644 --- a/src/freedreno/vulkan/tu_knl.h +++ b/src/freedreno/vulkan/tu_knl.h @@ -103,8 +103,15 @@ struct tu_knl { void *metadata, uint32_t metadata_size); VkResult (*device_wait_u_trace)(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); - VkResult (*queue_submit)(struct tu_queue *queue, - struct vk_queue_submit *submit); + void *(*submit_create)(struct tu_device *device); + void (*submit_finish)(struct tu_device *device, void *_submit); + void (*submit_add_entries)(struct tu_device *device, void *_submit, + struct tu_cs_entry *entries, + unsigned num_entries); + VkResult (*queue_submit)(struct tu_queue *queue, void *_submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data *u_trace_submission_data); const struct vk_device_entrypoint_table *device_entrypoints; }; @@ -237,7 +244,21 @@ tu_drm_submitqueue_new(struct tu_device *dev, void tu_drm_submitqueue_close(struct tu_device *dev, uint32_t queue_id); +void * +tu_submit_create(struct tu_device *dev); + +void +tu_submit_finish(struct tu_device *dev, void *submit); + +void +tu_submit_add_entries(struct tu_device *dev, void *submit, + struct tu_cs_entry *entries, + unsigned num_entries); + VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit); +tu_queue_submit(struct tu_queue *queue, void *submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data *u_trace_submission_data); #endif /* TU_DRM_H */ diff --git a/src/freedreno/vulkan/tu_knl_drm.cc b/src/freedreno/vulkan/tu_knl_drm.cc index d0a537dc598..9578c46211e 100644 --- a/src/freedreno/vulkan/tu_knl_drm.cc +++ b/src/freedreno/vulkan/tu_knl_drm.cc @@ -129,6 +129,52 @@ tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo) u_rwlock_rdunlock(&dev->dma_bo_lock); } +void * +msm_submit_create(struct tu_device *device) +{ + return vk_zalloc(&device->vk.alloc, sizeof(struct tu_msm_queue_submit), 8, + 
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); +} + +void +msm_submit_finish(struct tu_device *device, + void *_submit) +{ + struct tu_msm_queue_submit *submit = + (struct tu_msm_queue_submit *)_submit; + + util_dynarray_fini(&submit->commands); + util_dynarray_fini(&submit->command_bos); + vk_free(&device->vk.alloc, submit); +} + +void +msm_submit_add_entries(struct tu_device *device, void *_submit, + struct tu_cs_entry *entries, unsigned num_entries) +{ + struct tu_msm_queue_submit *submit = + (struct tu_msm_queue_submit *)_submit; + + struct drm_msm_gem_submit_cmd *cmds = (struct drm_msm_gem_submit_cmd *) + util_dynarray_grow(&submit->commands, struct drm_msm_gem_submit_cmd, + num_entries); + + const struct tu_bo **bos = (const struct tu_bo **) + util_dynarray_grow(&submit->command_bos, struct tu_bo *, + num_entries); + + for (unsigned i = 0; i < num_entries; i++) { + cmds[i].type = MSM_SUBMIT_CMD_BUF; + cmds[i].submit_idx = entries[i].bo->bo_list_idx; + cmds[i].submit_offset = entries[i].offset; + cmds[i].size = entries[i].size; + cmds[i].pad = 0; + cmds[i].nr_relocs = 0; + cmds[i].relocs = 0; + bos[i] = entries[i].bo; + } +} + uint32_t tu_syncobj_from_vk_sync(struct vk_sync *sync) { diff --git a/src/freedreno/vulkan/tu_knl_drm.h b/src/freedreno/vulkan/tu_knl_drm.h index f19969ceb8a..7e6dea1456d 100644 --- a/src/freedreno/vulkan/tu_knl_drm.h +++ b/src/freedreno/vulkan/tu_knl_drm.h @@ -22,6 +22,18 @@ VkResult tu_allocate_userspace_iova(struct tu_device *dev, int tu_drm_export_dmabuf(struct tu_device *dev, struct tu_bo *bo); void tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo); +struct tu_msm_queue_submit +{ + struct util_dynarray commands; + struct util_dynarray command_bos; +}; + +void *msm_submit_create(struct tu_device *device); +void msm_submit_finish(struct tu_device *device, void *_submit); +void msm_submit_add_entries(struct tu_device *device, void *_submit, + struct tu_cs_entry *entries, + unsigned num_entries); + static inline void get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns) { @@ -54,4 +66,4 @@ to_tu_timeline_sync(struct vk_sync *sync) uint32_t tu_syncobj_from_vk_sync(struct vk_sync *sync); -#endif \ No newline at end of file +#endif diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc index 8016b62e052..28d6089b734 100644 --- a/src/freedreno/vulkan/tu_knl_drm_msm.cc +++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc @@ -27,25 +27,6 @@ #include "tu_rmv.h" #include "redump.h" -struct tu_msm_queue_submit -{ - struct vk_queue_submit *vk_submit; - struct tu_u_trace_submission_data *u_trace_submission_data; - - struct tu_cmd_buffer **cmd_buffers; - struct drm_msm_gem_submit_cmd *cmds; - struct drm_msm_gem_submit_syncobj *in_syncobjs; - struct drm_msm_gem_submit_syncobj *out_syncobjs; - - uint32_t nr_cmd_buffers; - uint32_t nr_in_syncobjs; - uint32_t nr_out_syncobjs; - uint32_t entry_count; - uint32_t perf_pass_index; - - bool autotune_fence; -}; - struct tu_u_trace_syncobj { uint32_t msm_queue_id; @@ -804,198 +785,73 @@ msm_bo_get_metadata(struct tu_device *dev, struct tu_bo *bo, } static VkResult -tu_queue_submit_create_locked(struct tu_queue *queue, - struct vk_queue_submit *vk_submit, - const uint32_t nr_in_syncobjs, - const uint32_t nr_out_syncobjs, - uint32_t perf_pass_index, - struct tu_msm_queue_submit *new_submit) +msm_queue_submit(struct tu_queue *queue, void *_submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data 
*u_trace_submission_data) { - VkResult result; + VkResult result = VK_SUCCESS; + int ret; + struct tu_msm_queue_submit *submit = + (struct tu_msm_queue_submit *)_submit; + struct drm_msm_gem_submit_syncobj *in_syncobjs, *out_syncobjs; + struct drm_msm_gem_submit req; + uint32_t submit_idx = queue->device->submit_count; + uint64_t gpu_offset = 0; + uint32_t entry_count = + util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd); +#if HAVE_PERFETTO + struct tu_perfetto_clocks clocks; +#endif - bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); - bool has_trace_points = false; - - struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers; - - memset(new_submit, 0, sizeof(struct tu_msm_queue_submit)); - - new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers; - new_submit->nr_cmd_buffers = vk_submit->command_buffer_count; - tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers, - &new_submit->nr_cmd_buffers); - - uint32_t entry_count = 0; - for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) { - struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j]; - - if (perf_pass_index != ~0) - entry_count++; - - entry_count += cmdbuf->cs.entry_count; - - if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { - if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) - entry_count++; - - has_trace_points = true; - } - } - - new_submit->autotune_fence = - tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers); - if (new_submit->autotune_fence) - entry_count++; - - new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc( - &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->cmds == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_cmds; - } - - if (has_trace_points) { - result = - tu_u_trace_submission_data_create( - queue->device, new_submit->cmd_buffers, - new_submit->nr_cmd_buffers, - &new_submit->u_trace_submission_data); - - if (result != VK_SUCCESS) { - goto fail_u_trace_submission_data; - } - } + uint32_t flags = MSM_PIPE_3D0; /* Allocate without wait timeline semaphores */ - new_submit->in_syncobjs = (struct drm_msm_gem_submit_syncobj *) vk_zalloc( + in_syncobjs = (struct drm_msm_gem_submit_syncobj *) vk_zalloc( &queue->device->vk.alloc, - nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8, + wait_count * sizeof(*in_syncobjs), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (new_submit->in_syncobjs == NULL) { + if (in_syncobjs == NULL) { result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_in_syncobjs; } /* Allocate with signal timeline semaphores considered */ - new_submit->out_syncobjs = (struct drm_msm_gem_submit_syncobj *) vk_zalloc( + out_syncobjs = (struct drm_msm_gem_submit_syncobj *) vk_zalloc( &queue->device->vk.alloc, - nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8, + signal_count * sizeof(*out_syncobjs), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (new_submit->out_syncobjs == NULL) { + if (out_syncobjs == NULL) { result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_out_syncobjs; } - new_submit->entry_count = entry_count; - new_submit->nr_in_syncobjs = nr_in_syncobjs; - new_submit->nr_out_syncobjs = nr_out_syncobjs; - new_submit->perf_pass_index = perf_pass_index; - new_submit->vk_submit = vk_submit; + for (uint32_t i = 0; i < wait_count; i++) { + struct vk_sync *sync = waits[i].sync; - return 
VK_SUCCESS; - -fail_out_syncobjs: - vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs); -fail_in_syncobjs: - if (new_submit->u_trace_submission_data) - tu_u_trace_submission_data_finish(queue->device, - new_submit->u_trace_submission_data); -fail_u_trace_submission_data: - vk_free(&queue->device->vk.alloc, new_submit->cmds); -fail_cmds: - return result; -} - -static void -tu_queue_submit_finish(struct tu_queue *queue, struct tu_msm_queue_submit *submit) -{ - vk_free(&queue->device->vk.alloc, submit->cmds); - vk_free(&queue->device->vk.alloc, submit->in_syncobjs); - vk_free(&queue->device->vk.alloc, submit->out_syncobjs); - if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers) - vk_free(&queue->device->vk.alloc, submit->cmd_buffers); -} - -static void -tu_fill_msm_gem_submit(struct tu_device *dev, - struct drm_msm_gem_submit_cmd *cmd, - struct tu_cs_entry *cs_entry) -{ - cmd->type = MSM_SUBMIT_CMD_BUF; - cmd->submit_idx = cs_entry->bo->bo_list_idx; - cmd->submit_offset = cs_entry->offset; - cmd->size = cs_entry->size; - cmd->pad = 0; - cmd->nr_relocs = 0; - cmd->relocs = 0; -} - -static void -tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, - struct tu_msm_queue_submit *submit, - struct tu_cs *autotune_cs) -{ - struct tu_device *dev = queue->device; - struct drm_msm_gem_submit_cmd *cmds = submit->cmds; - - uint32_t entry_idx = 0; - for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) { - struct tu_device *dev = queue->device; - struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j]; - struct tu_cs *cs = &cmdbuf->cs; - - if (submit->perf_pass_index != ~0) { - struct tu_cs_entry *perf_cs_entry = - &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index]; - - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry); - entry_idx++; - } - - for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]); - } - - if (submit->u_trace_submission_data) { - struct tu_cs *ts_cs = - submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs; - if (ts_cs) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]); - entry_idx++; - } - } + in_syncobjs[i] = (struct drm_msm_gem_submit_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + .point = waits[i].wait_value, + }; } - if (autotune_cs) { - assert(autotune_cs->entry_count == 1); - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]); - entry_idx++; - } -} + for (uint32_t i = 0; i < signal_count; i++) { + struct vk_sync *sync = signals[i].sync; -static VkResult -tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submit) -{ - uint32_t submit_idx = queue->device->submit_count++; - - struct tu_cs *autotune_cs = NULL; - if (submit->autotune_fence) { - autotune_cs = tu_autotune_on_submit(queue->device, - &queue->device->autotune, - submit->cmd_buffers, - submit->nr_cmd_buffers); + out_syncobjs[i] = (struct drm_msm_gem_submit_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + .point = signals[i].signal_value, + }; } - uint32_t flags = MSM_PIPE_3D0; - - if (submit->vk_submit->wait_count) + if (wait_count) flags |= MSM_SUBMIT_SYNCOBJ_IN; - if (submit->vk_submit->signal_count) + if (signal_count) flags |= MSM_SUBMIT_SYNCOBJ_OUT; mtx_lock(&queue->device->bo_mutex); @@ -1004,22 +860,30 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submi flags |= MSM_SUBMIT_NO_IMPLICIT; /* drm_msm_gem_submit_cmd requires index of bo which could 
change at any - * time when bo_mutex is not locked. So we build submit cmds here the real - * place to submit. + * time when bo_mutex is not locked. So we update the index here under the + * lock. */ - tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs); + util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd, + cmd) { + unsigned i = cmd - + util_dynarray_element(&submit->commands, + struct drm_msm_gem_submit_cmd, 0); + struct tu_bo **bo = util_dynarray_element(&submit->command_bos, + struct tu_bo *, i); + cmd->submit_idx = (*bo)->bo_list_idx; + } - struct drm_msm_gem_submit req = { + req = (struct drm_msm_gem_submit) { .flags = flags, - .nr_bos = submit->entry_count ? queue->device->bo_count : 0, - .nr_cmds = submit->entry_count, + .nr_bos = entry_count ? queue->device->bo_count : 0, + .nr_cmds = entry_count, .bos = (uint64_t)(uintptr_t) queue->device->bo_list, - .cmds = (uint64_t)(uintptr_t)submit->cmds, + .cmds = (uint64_t)(uintptr_t)submit->commands.data, .queueid = queue->msm_queue_id, - .in_syncobjs = (uint64_t)(uintptr_t)submit->in_syncobjs, - .out_syncobjs = (uint64_t)(uintptr_t)submit->out_syncobjs, - .nr_in_syncobjs = submit->nr_in_syncobjs, - .nr_out_syncobjs = submit->nr_out_syncobjs, + .in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs, + .out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs, + .nr_in_syncobjs = wait_count, + .nr_out_syncobjs = signal_count, .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj), }; @@ -1052,8 +916,8 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submi } } - for (unsigned i = 0; i < req.nr_cmds; i++) { - struct drm_msm_gem_submit_cmd *cmd = &submit->cmds[i]; + util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd, + cmd) { uint64_t iova = device->bo_list[cmd->submit_idx].presumed + cmd->submit_offset; uint32_t size = cmd->size >> 2; uint32_t buf[3] = { iova, size, iova >> 32 }; @@ -1063,60 +927,40 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submi fd_rd_output_end(rd_output); } - int ret = drmCommandWriteRead(queue->device->fd, - DRM_MSM_GEM_SUBMIT, - &req, sizeof(req)); + ret = drmCommandWriteRead(queue->device->fd, + DRM_MSM_GEM_SUBMIT, + &req, sizeof(req)); mtx_unlock(&queue->device->bo_mutex); - tu_debug_bos_print_stats(queue->device); - - if (ret) - return vk_device_set_lost(&queue->device->vk, "submit failed: %m"); + if (ret) { + result = vk_device_set_lost(&queue->device->vk, "submit failed: %m"); + goto fail_submit; + } p_atomic_set(&queue->fence, req.fence); - uint64_t gpu_offset = 0; #if HAVE_PERFETTO - struct tu_perfetto_clocks clocks = - tu_perfetto_submit(queue->device, queue->device->submit_count, NULL); + clocks = tu_perfetto_submit(queue->device, queue->device->submit_count, NULL); gpu_offset = clocks.gpu_ts_offset; #endif - if (submit->u_trace_submission_data) { - struct tu_u_trace_submission_data *submission_data = - submit->u_trace_submission_data; - submission_data->submission_id = queue->device->submit_count; - submission_data->gpu_ts_offset = gpu_offset; + if (u_trace_submission_data) { + u_trace_submission_data->gpu_ts_offset = gpu_offset; /* We have to allocate it here since it is different between drm/kgsl */ - submission_data->syncobj = (struct tu_u_trace_syncobj *) + u_trace_submission_data->syncobj = (struct tu_u_trace_syncobj *) vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - submission_data->syncobj->fence = req.fence; - 
submission_data->syncobj->msm_queue_id = queue->msm_queue_id; - - submit->u_trace_submission_data = NULL; - - for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { - bool free_data = i == submission_data->last_buffer_with_tracepoints; - if (submission_data->cmd_trace_data[i].trace) - u_trace_flush(submission_data->cmd_trace_data[i].trace, - submission_data, queue->device->vk.current_frame, - free_data); - - if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { - /* u_trace is owned by cmd_buffer */ - submission_data->cmd_trace_data[i].trace = NULL; - } - } + u_trace_submission_data->syncobj->fence = req.fence; + u_trace_submission_data->syncobj->msm_queue_id = queue->msm_queue_id; } - for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync)) + for (uint32_t i = 0; i < wait_count; i++) { + if (!vk_sync_is_tu_timeline_sync(waits[i].sync)) continue; struct tu_timeline_sync *sync = - container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base); + container_of(waits[i].sync, struct tu_timeline_sync, base); assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET); @@ -1126,12 +970,12 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submi sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED; } - for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync)) + for (uint32_t i = 0; i < signal_count; i++) { + if (!vk_sync_is_tu_timeline_sync(signals[i].sync)) continue; struct tu_timeline_sync *sync = - container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base); + container_of(signals[i].sync, struct tu_timeline_sync, base); assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET); /* Set SUBMITTED to the state of the signal timeline sync so we could wait for @@ -1140,9 +984,12 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_msm_queue_submit *submi sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED; } - pthread_cond_broadcast(&queue->device->timeline_cond); - - return VK_SUCCESS; +fail_submit: + vk_free(&queue->device->vk.alloc, out_syncobjs); +fail_out_syncobjs: + vk_free(&queue->device->vk.alloc, in_syncobjs); +fail_in_syncobjs: + return result; } static VkResult @@ -1151,68 +998,6 @@ msm_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncob return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000); } -static VkResult -msm_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit) -{ - MESA_TRACE_FUNC(); - uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs_entries ? 
- submit->perf_pass_index : ~0; - struct tu_msm_queue_submit submit_req; - - if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) { - tu_dbg_log_gmem_load_store_skips(queue->device); - } - - pthread_mutex_lock(&queue->device->submit_mutex); - - VkResult ret = tu_queue_submit_create_locked(queue, submit, - submit->wait_count, submit->signal_count, - perf_pass_index, &submit_req); - - if (ret != VK_SUCCESS) { - pthread_mutex_unlock(&queue->device->submit_mutex); - return ret; - } - - /* note: assuming there won't be any very large semaphore counts */ - struct drm_msm_gem_submit_syncobj *in_syncobjs = submit_req.in_syncobjs; - struct drm_msm_gem_submit_syncobj *out_syncobjs = submit_req.out_syncobjs; - - uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0; - - for (uint32_t i = 0; i < submit->wait_count; i++) { - struct vk_sync *sync = submit->waits[i].sync; - - in_syncobjs[nr_in_syncobjs++] = (struct drm_msm_gem_submit_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - .point = submit->waits[i].wait_value, - }; - } - - for (uint32_t i = 0; i < submit->signal_count; i++) { - struct vk_sync *sync = submit->signals[i].sync; - - out_syncobjs[nr_out_syncobjs++] = (struct drm_msm_gem_submit_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - .point = submit->signals[i].signal_value, - }; - } - - ret = tu_queue_submit_locked(queue, &submit_req); - - pthread_mutex_unlock(&queue->device->submit_mutex); - tu_queue_submit_finish(queue, &submit_req); - - if (ret != VK_SUCCESS) - return ret; - - u_trace_context_process(&queue->device->trace_context, false); - - return VK_SUCCESS; -} - static const struct tu_knl msm_knl_funcs = { .name = "msm", @@ -1232,6 +1017,9 @@ static const struct tu_knl msm_knl_funcs = { .bo_set_metadata = msm_bo_set_metadata, .bo_get_metadata = msm_bo_get_metadata, .device_wait_u_trace = msm_device_wait_u_trace, + .submit_create = msm_submit_create, + .submit_finish = msm_submit_finish, + .submit_add_entries = msm_submit_add_entries, .queue_submit = msm_queue_submit, }; diff --git a/src/freedreno/vulkan/tu_knl_drm_virtio.cc b/src/freedreno/vulkan/tu_knl_drm_virtio.cc index b869e23c9b3..dad47938749 100644 --- a/src/freedreno/vulkan/tu_knl_drm_virtio.cc +++ b/src/freedreno/vulkan/tu_knl_drm_virtio.cc @@ -45,24 +45,6 @@ struct tu_userspace_fence_cmds { struct tu_userspace_fence_cmd cmds[64]; }; -struct tu_virtio_queue_submit { - struct vk_queue_submit *vk_submit; - struct tu_u_trace_submission_data *u_trace_submission_data; - - struct tu_cmd_buffer **cmd_buffers; - struct drm_msm_gem_submit_cmd *cmds; - struct drm_virtgpu_execbuffer_syncobj *in_syncobjs; - struct drm_virtgpu_execbuffer_syncobj *out_syncobjs; - - uint32_t nr_cmd_buffers; - uint32_t nr_in_syncobjs; - uint32_t nr_out_syncobjs; - uint32_t entry_count; - uint32_t perf_pass_index; - - bool autotune_fence; -}; - struct tu_u_trace_syncobj { uint32_t msm_queue_id; uint32_t fence; @@ -835,206 +817,6 @@ virtio_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo) mtx_unlock(&dev->bo_mutex); } -static VkResult -tu_queue_submit_create_locked(struct tu_queue *queue, - struct vk_queue_submit *vk_submit, - const uint32_t nr_in_syncobjs, - const uint32_t nr_out_syncobjs, - uint32_t perf_pass_index, - struct tu_virtio_queue_submit *new_submit) -{ - VkResult result; - - bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); - bool has_trace_points = false; - - struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers; - - memset(new_submit, 0, sizeof(struct tu_virtio_queue_submit)); - - 
new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers; - new_submit->nr_cmd_buffers = vk_submit->command_buffer_count; - tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers, - &new_submit->nr_cmd_buffers); - - uint32_t entry_count = 0; - for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) { - struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j]; - - if (perf_pass_index != ~0) - entry_count++; - - entry_count += cmdbuf->cs.entry_count; - - if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { - if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) - entry_count++; - - has_trace_points = true; - } - } - - new_submit->autotune_fence = - tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers); - if (new_submit->autotune_fence) - entry_count++; - - /* Add one for the userspace fence cmd: */ - entry_count += 1; - - new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc( - &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->cmds == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_cmds; - } - - if (has_trace_points) { - result = - tu_u_trace_submission_data_create( - queue->device, new_submit->cmd_buffers, - new_submit->nr_cmd_buffers, - &new_submit->u_trace_submission_data); - - if (result != VK_SUCCESS) { - goto fail_u_trace_submission_data; - } - } - - /* Allocate without wait timeline semaphores */ - new_submit->in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( - &queue->device->vk.alloc, - nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->in_syncobjs == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_in_syncobjs; - } - - /* Allocate with signal timeline semaphores considered */ - new_submit->out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( - &queue->device->vk.alloc, - nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->out_syncobjs == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_out_syncobjs; - } - - new_submit->entry_count = entry_count; - new_submit->nr_in_syncobjs = nr_in_syncobjs; - new_submit->nr_out_syncobjs = nr_out_syncobjs; - new_submit->perf_pass_index = perf_pass_index; - new_submit->vk_submit = vk_submit; - - return VK_SUCCESS; - -fail_out_syncobjs: - vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs); -fail_in_syncobjs: - if (new_submit->u_trace_submission_data) - tu_u_trace_submission_data_finish(queue->device, - new_submit->u_trace_submission_data); -fail_u_trace_submission_data: - vk_free(&queue->device->vk.alloc, new_submit->cmds); -fail_cmds: - return result; -} - -static void -tu_queue_submit_finish(struct tu_queue *queue, struct tu_virtio_queue_submit *submit) -{ - vk_free(&queue->device->vk.alloc, submit->cmds); - vk_free(&queue->device->vk.alloc, submit->in_syncobjs); - vk_free(&queue->device->vk.alloc, submit->out_syncobjs); - if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers) - vk_free(&queue->device->vk.alloc, submit->cmd_buffers); -} - -static void -tu_fill_msm_gem_submit(struct tu_device *dev, - struct drm_msm_gem_submit_cmd *cmd, - struct tu_cs_entry *cs_entry) -{ - cmd->type = MSM_SUBMIT_CMD_BUF; - cmd->submit_idx = cs_entry->bo->bo_list_idx; - cmd->submit_offset = cs_entry->offset; - cmd->size = cs_entry->size; - 
cmd->pad = 0; - cmd->nr_relocs = 0; - cmd->relocs = 0; -} - -static void -tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, - struct tu_virtio_queue_submit *submit, - struct tu_cs *autotune_cs) -{ - struct tu_device *dev = queue->device; - struct tu_virtio_device *vdev = dev->vdev; - struct drm_msm_gem_submit_cmd *cmds = submit->cmds; - - uint32_t entry_idx = 0; - for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) { - struct tu_device *dev = queue->device; - struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j]; - struct tu_cs *cs = &cmdbuf->cs; - - if (submit->perf_pass_index != ~0) { - struct tu_cs_entry *perf_cs_entry = - &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index]; - - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry); - entry_idx++; - } - - for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]); - } - - if (submit->u_trace_submission_data) { - struct tu_cs *ts_cs = - submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs; - if (ts_cs) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]); - entry_idx++; - } - } - } - - if (autotune_cs) { - assert(autotune_cs->entry_count == 1); - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]); - entry_idx++; - } - - /* Last, add the userspace fence cmd: */ - struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds; - if (queue->fence <= 0) - queue->fence = 0; - uint32_t fence = ++queue->fence; - int idx = fence % ARRAY_SIZE(fcmds->cmds); - - /* Wait for previous usage of fence cmd to be idle.. in practice the table - * of recycled cmds should be big enough to never stall here: - */ - tu_wait_fence(dev, dev->queues[0]->msm_queue_id, fcmds->cmds[idx].fence, 3000000000); - - fcmds->cmds[idx].fence = fence; - - cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; - cmds[entry_idx].submit_idx = vdev->fence_cmds_mem->bo_list_idx; - cmds[entry_idx].submit_offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds; - cmds[entry_idx].size = 5 * 4; - cmds[entry_idx].pad = 0; - cmds[entry_idx].nr_relocs = 0; - cmds[entry_idx].relocs = 0; -} - static VkResult setup_fence_cmds(struct tu_device *dev) { @@ -1078,11 +860,23 @@ setup_fence_cmds(struct tu_device *dev) } static VkResult -tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *submit) +virtio_queue_submit(struct tu_queue *queue, void *_submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data *u_trace_submission_data) { + VkResult result = VK_SUCCESS; + int ret; + struct tu_msm_queue_submit *submit = + (struct tu_msm_queue_submit *)_submit; struct tu_virtio_device *vdev = queue->device->vdev; - - queue->device->submit_count++; + struct drm_virtgpu_execbuffer_syncobj *in_syncobjs, *out_syncobjs; + uint64_t gpu_offset = 0; + int ring_idx = queue->priority + 1; + struct vdrm_execbuf_params params; +#if HAVE_PERFETTO + struct tu_perfetto_clocks clocks; +#endif /* It would be nice to not need to defer this, but virtio_device_init() * happens before the device is initialized enough to allocate normal @@ -1094,20 +888,74 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *su return result; } - struct tu_cs *autotune_cs = NULL; - if (submit->autotune_fence) { - autotune_cs = tu_autotune_on_submit(queue->device, - &queue->device->autotune, - submit->cmd_buffers, - submit->nr_cmd_buffers); - } + /* Add the userspace fence 
cmd: */ + struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds; + if (queue->fence <= 0) + queue->fence = 0; + uint32_t fence = ++queue->fence; + int idx = fence % ARRAY_SIZE(fcmds->cmds); + struct tu_cs_entry fence_cs = { + .bo = vdev->fence_cmds_mem, + .size = 5 * 4, + .offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds, + }; + msm_submit_add_entries(queue->device, _submit, &fence_cs, 1); + uint32_t entry_count = + util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd); + unsigned nr_bos = entry_count ? queue->device->bo_count : 0; + unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo); + unsigned cmd_len = entry_count * sizeof(struct drm_msm_gem_submit_cmd); + unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len; + struct msm_ccmd_gem_submit_req *req; uint32_t flags = MSM_PIPE_3D0; - if (submit->vk_submit->wait_count) + /* Allocate without wait timeline semaphores */ + in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( + &queue->device->vk.alloc, + wait_count * sizeof(*in_syncobjs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (in_syncobjs == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_in_syncobjs; + } + + /* Allocate with signal timeline semaphores considered */ + out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( + &queue->device->vk.alloc, + signal_count * sizeof(*out_syncobjs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (out_syncobjs == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_out_syncobjs; + } + + for (uint32_t i = 0; i < wait_count; i++) { + struct vk_sync *sync = waits[i].sync; + + in_syncobjs[i] = (struct drm_virtgpu_execbuffer_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + .point = waits[i].wait_value, + }; + } + + for (uint32_t i = 0; i < signal_count; i++) { + struct vk_sync *sync = signals[i].sync; + + out_syncobjs[i] = (struct drm_virtgpu_execbuffer_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + .point = signals[i].signal_value, + }; + } + + if (wait_count) flags |= MSM_SUBMIT_SYNCOBJ_IN; - if (submit->vk_submit->signal_count) + if (signal_count) flags |= MSM_SUBMIT_SYNCOBJ_OUT; mtx_lock(&queue->device->bo_mutex); @@ -1116,32 +964,33 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *su flags |= MSM_SUBMIT_NO_IMPLICIT; /* drm_msm_gem_submit_cmd requires index of bo which could change at any - * time when bo_mutex is not locked. So we build submit cmds here the real - * place to submit. + * time when bo_mutex is not locked. So we update the index here under the + * lock. */ - tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs); + util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd, + cmd) { + unsigned i = cmd - + util_dynarray_element(&submit->commands, + struct drm_msm_gem_submit_cmd, 0); + struct tu_bo **bo = util_dynarray_element(&submit->command_bos, + struct tu_bo *, i); + cmd->submit_idx = (*bo)->bo_list_idx; + } - /* TODO avoid extra memcpy, and populate bo's and cmds directly - * into the req msg - */ - unsigned nr_cmds = submit->entry_count; - unsigned nr_bos = nr_cmds ? 
queue->device->bo_count : 0; - unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo); - unsigned cmd_len = nr_cmds * sizeof(struct drm_msm_gem_submit_cmd); - unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len; - struct msm_ccmd_gem_submit_req *req = (struct msm_ccmd_gem_submit_req *)vk_alloc( + req = (struct msm_ccmd_gem_submit_req *)vk_alloc( &queue->device->vk.alloc, req_len, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!req) { mtx_unlock(&queue->device->bo_mutex); - return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_alloc_req; } req->hdr = MSM_CCMD(GEM_SUBMIT, req_len); req->flags = flags; req->queue_id = queue->msm_queue_id; req->nr_bos = nr_bos; - req->nr_cmds = nr_cmds; + req->nr_cmds = entry_count; /* Use same kernel fence and userspace fence seqno to avoid having * to track both: @@ -1149,70 +998,47 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *su req->fence = queue->fence; memcpy(req->payload, queue->device->bo_list, bos_len); - memcpy(req->payload + bos_len, submit->cmds, cmd_len); + memcpy(req->payload + bos_len, submit->commands.data, cmd_len); - int ring_idx = queue->priority + 1; - int ret; - - struct vdrm_execbuf_params p = { + params = (struct vdrm_execbuf_params) { .ring_idx = ring_idx, .req = &req->hdr, - .in_syncobjs = submit->in_syncobjs, - .out_syncobjs = submit->out_syncobjs, - .num_in_syncobjs = submit->nr_in_syncobjs, - .num_out_syncobjs = submit->nr_out_syncobjs, + .in_syncobjs = in_syncobjs, + .out_syncobjs = out_syncobjs, + .num_in_syncobjs = wait_count, + .num_out_syncobjs = signal_count, }; - ret = vdrm_execbuf(vdev->vdrm, &p); + ret = vdrm_execbuf(vdev->vdrm, ¶ms); mtx_unlock(&queue->device->bo_mutex); - tu_debug_bos_print_stats(queue->device); + if (ret) { + result = vk_device_set_lost(&queue->device->vk, "submit failed: %m"); + goto fail_submit; + } - if (ret) - return vk_device_set_lost(&queue->device->vk, "submit failed: %m"); - - uint64_t gpu_offset = 0; #if HAVE_PERFETTO - struct tu_perfetto_clocks clocks = - tu_perfetto_submit(queue->device, queue->device->submit_count, NULL); + clocks = tu_perfetto_submit(queue->device, queue->device->submit_count, NULL); gpu_offset = clocks.gpu_ts_offset; #endif - if (submit->u_trace_submission_data) { - struct tu_u_trace_submission_data *submission_data = - submit->u_trace_submission_data; - submission_data->submission_id = queue->device->submit_count; - submission_data->gpu_ts_offset = gpu_offset; + if (u_trace_submission_data) { + u_trace_submission_data->gpu_ts_offset = gpu_offset; /* We have to allocate it here since it is different between drm/kgsl */ - submission_data->syncobj = (struct tu_u_trace_syncobj *) + u_trace_submission_data->syncobj = (struct tu_u_trace_syncobj *) vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - submission_data->syncobj->fence = req->fence; - submission_data->syncobj->msm_queue_id = queue->msm_queue_id; - - submit->u_trace_submission_data = NULL; - - for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { - bool free_data = i == submission_data->last_buffer_with_tracepoints; - if (submission_data->cmd_trace_data[i].trace) - u_trace_flush(submission_data->cmd_trace_data[i].trace, - submission_data, queue->device->vk.current_frame, - free_data); - - if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { - /* u_trace is owned by cmd_buffer */ - 
submission_data->cmd_trace_data[i].trace = NULL; - } - } + u_trace_submission_data->syncobj->fence = req->fence; + u_trace_submission_data->syncobj->msm_queue_id = queue->msm_queue_id; } - for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync)) + for (uint32_t i = 0; i < wait_count; i++) { + if (!vk_sync_is_tu_timeline_sync(waits[i].sync)) continue; struct tu_timeline_sync *sync = - container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base); + container_of(waits[i].sync, struct tu_timeline_sync, base); assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET); @@ -1222,12 +1048,12 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *su sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED; } - for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync)) + for (uint32_t i = 0; i < signal_count; i++) { + if (!vk_sync_is_tu_timeline_sync(signals[i].sync)) continue; struct tu_timeline_sync *sync = - container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base); + container_of(signals[i].sync, struct tu_timeline_sync, base); assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET); /* Set SUBMITTED to the state of the signal timeline sync so we could wait for @@ -1236,9 +1062,14 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_virtio_queue_submit *su sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED; } - pthread_cond_broadcast(&queue->device->timeline_cond); - - return VK_SUCCESS; +fail_submit: + vk_free(&queue->device->vk.alloc, req); +fail_alloc_req: + vk_free(&queue->device->vk.alloc, out_syncobjs); +fail_out_syncobjs: + vk_free(&queue->device->vk.alloc, in_syncobjs); +fail_in_syncobjs: + return result; } static VkResult @@ -1247,68 +1078,6 @@ virtio_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syn return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000); } -static VkResult -virtio_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit) -{ - MESA_TRACE_FUNC(); - uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs_entries ? 
- submit->perf_pass_index : ~0; - struct tu_virtio_queue_submit submit_req; - - if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) { - tu_dbg_log_gmem_load_store_skips(queue->device); - } - - pthread_mutex_lock(&queue->device->submit_mutex); - - VkResult ret = tu_queue_submit_create_locked(queue, submit, - submit->wait_count, submit->signal_count, - perf_pass_index, &submit_req); - - if (ret != VK_SUCCESS) { - pthread_mutex_unlock(&queue->device->submit_mutex); - return ret; - } - - /* note: assuming there won't be any very large semaphore counts */ - struct drm_virtgpu_execbuffer_syncobj *in_syncobjs = submit_req.in_syncobjs; - struct drm_virtgpu_execbuffer_syncobj *out_syncobjs = submit_req.out_syncobjs; - - uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0; - - for (uint32_t i = 0; i < submit->wait_count; i++) { - struct vk_sync *sync = submit->waits[i].sync; - - in_syncobjs[nr_in_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - .point = submit->waits[i].wait_value, - }; - } - - for (uint32_t i = 0; i < submit->signal_count; i++) { - struct vk_sync *sync = submit->signals[i].sync; - - out_syncobjs[nr_out_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - .point = submit->signals[i].signal_value, - }; - } - - ret = tu_queue_submit_locked(queue, &submit_req); - - pthread_mutex_unlock(&queue->device->submit_mutex); - tu_queue_submit_finish(queue, &submit_req); - - if (ret != VK_SUCCESS) - return ret; - - u_trace_context_process(&queue->device->trace_context, false); - - return VK_SUCCESS; -} - static const struct tu_knl virtio_knl_funcs = { .name = "virtgpu", @@ -1326,6 +1095,9 @@ static const struct tu_knl virtio_knl_funcs = { .bo_allow_dump = virtio_bo_allow_dump, .bo_finish = tu_drm_bo_finish, .device_wait_u_trace = virtio_device_wait_u_trace, + .submit_create = msm_submit_create, + .submit_finish = msm_submit_finish, + .submit_add_entries = msm_submit_add_entries, .queue_submit = virtio_queue_submit, }; diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc index 6f00db79092..9eab1387fed 100644 --- a/src/freedreno/vulkan/tu_knl_kgsl.cc +++ b/src/freedreno/vulkan/tu_knl_kgsl.cc @@ -379,6 +379,7 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo) close(bo->shared_fd); TU_RMV(bo_destroy, dev, bo); + tu_debug_bos_del(dev, bo); struct kgsl_gpumem_free_id req = { .id = bo->gem_handle @@ -1041,20 +1042,62 @@ const struct vk_sync_type vk_kgsl_sync_type = { .export_sync_file = vk_kgsl_sync_export_sync_file, }; -static VkResult -kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) +struct tu_kgsl_queue_submit { + struct util_dynarray commands; +}; + +static void * +kgsl_submit_create(struct tu_device *device) { - MESA_TRACE_FUNC(); + return vk_zalloc(&device->vk.alloc, sizeof(struct tu_kgsl_queue_submit), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); +} - bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); - bool has_trace_points = false; +static void +kgsl_submit_finish(struct tu_device *device, + void *_submit) +{ + struct tu_kgsl_queue_submit *submit = + (struct tu_kgsl_queue_submit *)_submit; - if (vk_submit->command_buffer_count == 0) { - pthread_mutex_lock(&queue->device->submit_mutex); + util_dynarray_fini(&submit->commands); + vk_free(&device->vk.alloc, submit); +} - const struct kgsl_syncobj *wait_semaphores[vk_submit->wait_count + 1]; - for (uint32_t i = 0; i < vk_submit->wait_count; i++) { - 
wait_semaphores[i] = &container_of(vk_submit->waits[i].sync, +static void +kgsl_submit_add_entries(struct tu_device *device, void *_submit, + struct tu_cs_entry *entries, unsigned num_entries) +{ + struct tu_kgsl_queue_submit *submit = + (struct tu_kgsl_queue_submit *)_submit; + + struct kgsl_command_object *cmds = (struct kgsl_command_object *) + util_dynarray_grow(&submit->commands, struct kgsl_command_object, + num_entries); + + for (unsigned i = 0; i < num_entries; i++) { + cmds[i] = (struct kgsl_command_object) { + .gpuaddr = entries[i].bo->iova + entries[i].offset, + .size = entries[i].size, + .flags = KGSL_CMDLIST_IB, + .id = entries[i].bo->gem_handle, + }; + } +} + +static VkResult +kgsl_queue_submit(struct tu_queue *queue, void *_submit, + struct vk_sync_wait *waits, uint32_t wait_count, + struct vk_sync_signal *signals, uint32_t signal_count, + struct tu_u_trace_submission_data *u_trace_submission_data) +{ + struct tu_kgsl_queue_submit *submit = + (struct tu_kgsl_queue_submit *)_submit; + + if (submit->commands.size == 0) { + const struct kgsl_syncobj *wait_semaphores[wait_count + 1]; + for (uint32_t i = 0; i < wait_count; i++) { + wait_semaphores[i] = &container_of(waits[i].sync, struct vk_kgsl_syncobj, vk) ->syncobj; } @@ -1071,94 +1114,28 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) .state = KGSL_SYNCOBJ_STATE_SIGNALED, }; - wait_semaphores[vk_submit->wait_count] = &last_submit_sync; + wait_semaphores[wait_count] = &last_submit_sync; struct kgsl_syncobj wait_sync = - kgsl_syncobj_merge(wait_semaphores, vk_submit->wait_count + 1); + kgsl_syncobj_merge(wait_semaphores, wait_count + 1); assert(wait_sync.state != KGSL_SYNCOBJ_STATE_UNSIGNALED); // Would wait forever - for (uint32_t i = 0; i < vk_submit->signal_count; i++) { + for (uint32_t i = 0; i < signal_count; i++) { struct kgsl_syncobj *signal_sync = - &container_of(vk_submit->signals[i].sync, struct vk_kgsl_syncobj, - vk) + &container_of(signals[i].sync, struct vk_kgsl_syncobj, vk) ->syncobj; kgsl_syncobj_reset(signal_sync); *signal_sync = wait_sync; } - pthread_mutex_unlock(&queue->device->submit_mutex); - pthread_cond_broadcast(&queue->device->timeline_cond); - return VK_SUCCESS; } - uint32_t perf_pass_index = - queue->device->perfcntrs_pass_cs_entries ? 
vk_submit->perf_pass_index : ~0; - - if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) - tu_dbg_log_gmem_load_store_skips(queue->device); - VkResult result = VK_SUCCESS; - pthread_mutex_lock(&queue->device->submit_mutex); - - struct tu_cmd_buffer **cmd_buffers = - (struct tu_cmd_buffer **) vk_submit->command_buffers; - static_assert(offsetof(struct tu_cmd_buffer, vk) == 0, - "vk must be first member of tu_cmd_buffer"); - uint32_t cmdbuf_count = vk_submit->command_buffer_count; - - result = - tu_insert_dynamic_cmdbufs(queue->device, &cmd_buffers, &cmdbuf_count); - if (result != VK_SUCCESS) { - pthread_mutex_unlock(&queue->device->submit_mutex); - return result; - } - - uint32_t entry_count = 0; - for (uint32_t i = 0; i < cmdbuf_count; ++i) { - struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i]; - - if (perf_pass_index != ~0) - entry_count++; - - entry_count += cmd_buffer->cs.entry_count; - - if (u_trace_enabled && u_trace_has_points(&cmd_buffers[i]->trace)) { - if (!(cmd_buffers[i]->usage_flags & - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) - entry_count++; - - has_trace_points = true; - } - } - - if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) - entry_count++; - - struct kgsl_command_object *cmds = (struct kgsl_command_object *) - vk_alloc(&queue->device->vk.alloc, sizeof(*cmds) * entry_count, - alignof(*cmds), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (cmds == NULL) { - pthread_mutex_unlock(&queue->device->submit_mutex); - return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - uint32_t obj_count = 0; - if (has_trace_points) - obj_count++; - - struct kgsl_command_object *objs = (struct kgsl_command_object *) - vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count, - alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - struct tu_u_trace_submission_data *u_trace_submission_data = NULL; - if (has_trace_points) { - tu_u_trace_submission_data_create( - queue->device, cmd_buffers, cmdbuf_count, &u_trace_submission_data); - + if (u_trace_submission_data) { mtx_lock(&queue->device->kgsl_profiling_mutex); tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo, &queue->device->kgsl_profiling_suballoc, @@ -1166,46 +1143,13 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) mtx_unlock(&queue->device->kgsl_profiling_mutex); } - uint32_t entry_idx = 0; - for (uint32_t i = 0; i < cmdbuf_count; i++) { - struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i]; - struct tu_cs *cs = &cmd_buffer->cs; + uint32_t obj_count = 0; + if (u_trace_submission_data) + obj_count++; - if (perf_pass_index != ~0) { - struct tu_cs_entry *perf_cs_entry = - &cmd_buffer->device->perfcntrs_pass_cs_entries[perf_pass_index]; - - cmds[entry_idx++] = (struct kgsl_command_object) { - .gpuaddr = perf_cs_entry->bo->iova + perf_cs_entry->offset, - .size = perf_cs_entry->size, - .flags = KGSL_CMDLIST_IB, - .id = perf_cs_entry->bo->gem_handle, - }; - } - - for (uint32_t j = 0; j < cs->entry_count; j++) { - cmds[entry_idx++] = (struct kgsl_command_object) { - .gpuaddr = cs->entries[j].bo->iova + cs->entries[j].offset, - .size = cs->entries[j].size, - .flags = KGSL_CMDLIST_IB, - .id = cs->entries[j].bo->gem_handle, - }; - } - - if (u_trace_submission_data && - u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) { - struct tu_cs_entry *trace_cs_entry = - &u_trace_submission_data->cmd_trace_data[i] - .timestamp_copy_cs->entries[0]; - cmds[entry_idx++] = (struct kgsl_command_object) { - .offset = trace_cs_entry->offset, - .gpuaddr = trace_cs_entry->bo->iova, - .size = 
trace_cs_entry->size, - .flags = KGSL_CMDLIST_IB, - .id = trace_cs_entry->bo->gem_handle, - }; - } - } + struct kgsl_command_object *objs = (struct kgsl_command_object *) + vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count, + alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); struct kgsl_cmdbatch_profiling_buffer *profiling_buffer = NULL; uint32_t obj_idx = 0; @@ -1224,27 +1168,15 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) memset(profiling_buffer, 0, sizeof(*profiling_buffer)); } - if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { - struct tu_cs *autotune_cs = tu_autotune_on_submit( - queue->device, &queue->device->autotune, cmd_buffers, cmdbuf_count); - cmds[entry_idx++] = (struct kgsl_command_object) { - .gpuaddr = - autotune_cs->entries[0].bo->iova + autotune_cs->entries[0].offset, - .size = autotune_cs->entries[0].size, - .flags = KGSL_CMDLIST_IB, - .id = autotune_cs->entries[0].bo->gem_handle, - }; - } - - const struct kgsl_syncobj *wait_semaphores[vk_submit->wait_count]; - for (uint32_t i = 0; i < vk_submit->wait_count; i++) { + const struct kgsl_syncobj *wait_semaphores[wait_count]; + for (uint32_t i = 0; i < wait_count; i++) { wait_semaphores[i] = - &container_of(vk_submit->waits[i].sync, struct vk_kgsl_syncobj, vk) + &container_of(waits[i].sync, struct vk_kgsl_syncobj, vk) ->syncobj; } struct kgsl_syncobj wait_sync = - kgsl_syncobj_merge(wait_semaphores, vk_submit->wait_count); + kgsl_syncobj_merge(wait_semaphores, wait_count); assert(wait_sync.state != KGSL_SYNCOBJ_STATE_UNSIGNALED); // Would wait forever @@ -1281,9 +1213,10 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) struct kgsl_gpu_command req = { .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST, - .cmdlist = (uintptr_t) cmds, + .cmdlist = (uintptr_t) submit->commands.data, .cmdsize = sizeof(struct kgsl_command_object), - .numcmds = entry_idx, + .numcmds = util_dynarray_num_elements(&submit->commands, + struct kgsl_command_object), .synclist = (uintptr_t) &sync, .syncsize = sizeof(sync), .numsyncs = has_sync != 0 ? 
1 : 0, @@ -1349,9 +1282,9 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) p_atomic_set(&queue->fence, req.timestamp); - for (uint32_t i = 0; i < vk_submit->signal_count; i++) { + for (uint32_t i = 0; i < signal_count; i++) { struct kgsl_syncobj *signal_sync = - &container_of(vk_submit->signals[i].sync, struct vk_kgsl_syncobj, vk) + &container_of(signals[i].sync, struct vk_kgsl_syncobj, vk) ->syncobj; kgsl_syncobj_reset(signal_sync); @@ -1363,7 +1296,6 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) if (u_trace_submission_data) { struct tu_u_trace_submission_data *submission_data = u_trace_submission_data; - submission_data->submission_id = queue->device->submit_count; submission_data->gpu_ts_offset = gpu_offset; /* We have to allocate it here since it is different between drm/kgsl */ submission_data->syncobj = (struct tu_u_trace_syncobj *) @@ -1371,40 +1303,9 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); submission_data->syncobj->timestamp = req.timestamp; submission_data->syncobj->msm_queue_id = queue->msm_queue_id; - - u_trace_submission_data = NULL; - - for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { - bool free_data = i == submission_data->last_buffer_with_tracepoints; - if (submission_data->cmd_trace_data[i].trace) - u_trace_flush(submission_data->cmd_trace_data[i].trace, - submission_data, queue->device->vk.current_frame, - free_data); - - if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { - /* u_trace is owned by cmd_buffer */ - submission_data->cmd_trace_data[i].trace = NULL; - } - } } - queue->device->submit_count++; - - pthread_mutex_unlock(&queue->device->submit_mutex); - pthread_cond_broadcast(&queue->device->timeline_cond); - - u_trace_context_process(&queue->device->trace_context, false); - - if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers) - vk_free(&queue->device->vk.alloc, cmd_buffers); - - vk_free(&queue->device->vk.alloc, cmds); - - return VK_SUCCESS; - fail_submit: - pthread_mutex_unlock(&queue->device->submit_mutex); - if (result != VK_SUCCESS) { mtx_lock(&queue->device->kgsl_profiling_mutex); tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc, @@ -1412,11 +1313,6 @@ fail_submit: mtx_unlock(&queue->device->kgsl_profiling_mutex); } - if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers) - vk_free(&queue->device->vk.alloc, cmd_buffers); - - vk_free(&queue->device->vk.alloc, cmds); - return result; } @@ -1509,6 +1405,9 @@ static const struct tu_knl kgsl_knl_funcs = { .bo_allow_dump = kgsl_bo_allow_dump, .bo_finish = kgsl_bo_finish, .device_wait_u_trace = kgsl_device_wait_u_trace, + .submit_create = kgsl_submit_create, + .submit_finish = kgsl_submit_finish, + .submit_add_entries = kgsl_submit_add_entries, .queue_submit = kgsl_queue_submit, }; diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index e53ebfa61c5..509a79eef1d 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -9,6 +9,8 @@ #include "tu_queue.h" +#include "tu_cmd_buffer.h" +#include "tu_dynamic_rendering.h" #include "tu_knl.h" #include "tu_device.h" @@ -49,6 +51,125 @@ tu_get_submitqueue_priority(const struct tu_physical_device *pdevice, return priority; } +static VkResult +queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) +{ + struct tu_queue *queue = list_entry(_queue, struct tu_queue, vk); + struct 
tu_device *device = queue->device; + bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); + + uint32_t perf_pass_index = + device->perfcntrs_pass_cs_entries ? vk_submit->perf_pass_index : ~0; + + if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) + tu_dbg_log_gmem_load_store_skips(device); + + pthread_mutex_lock(&device->submit_mutex); + + struct tu_cmd_buffer **cmd_buffers = + (struct tu_cmd_buffer **) vk_submit->command_buffers; + uint32_t cmdbuf_count = vk_submit->command_buffer_count; + + VkResult result = + tu_insert_dynamic_cmdbufs(device, &cmd_buffers, &cmdbuf_count); + if (result != VK_SUCCESS) + return result; + + bool has_trace_points = false; + static_assert(offsetof(struct tu_cmd_buffer, vk) == 0, + "vk must be first member of tu_cmd_buffer"); + for (unsigned i = 0; i < vk_submit->command_buffer_count; i++) { + if (u_trace_enabled && u_trace_has_points(&cmd_buffers[i]->trace)) + has_trace_points = true; + } + + struct tu_u_trace_submission_data *u_trace_submission_data = NULL; + + void *submit = tu_submit_create(device); + if (!submit) + goto fail_create_submit; + + if (has_trace_points) { + tu_u_trace_submission_data_create( + device, cmd_buffers, cmdbuf_count, &u_trace_submission_data); + } + + for (uint32_t i = 0; i < cmdbuf_count; i++) { + struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i]; + struct tu_cs *cs = &cmd_buffer->cs; + + if (perf_pass_index != ~0) { + struct tu_cs_entry *perf_cs_entry = + &cmd_buffer->device->perfcntrs_pass_cs_entries[perf_pass_index]; + + tu_submit_add_entries(device, submit, perf_cs_entry, 1); + } + + tu_submit_add_entries(device, submit, cs->entries, + cs->entry_count); + + if (u_trace_submission_data && + u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) { + struct tu_cs_entry *trace_cs_entry = + &u_trace_submission_data->cmd_trace_data[i] + .timestamp_copy_cs->entries[0]; + tu_submit_add_entries(device, submit, trace_cs_entry, 1); + } + } + + if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { + struct tu_cs *autotune_cs = tu_autotune_on_submit( + device, &device->autotune, cmd_buffers, cmdbuf_count); + tu_submit_add_entries(device, submit, autotune_cs->entries, + autotune_cs->entry_count); + } + + result = + tu_queue_submit(queue, submit, vk_submit->waits, vk_submit->wait_count, + vk_submit->signals, vk_submit->signal_count, + u_trace_submission_data); + + if (result != VK_SUCCESS) { + pthread_mutex_unlock(&device->submit_mutex); + goto out; + } + + tu_debug_bos_print_stats(device); + + if (u_trace_submission_data) { + u_trace_submission_data->submission_id = device->submit_count; + + for (uint32_t i = 0; i < u_trace_submission_data->cmd_buffer_count; i++) { + bool free_data = i == u_trace_submission_data->last_buffer_with_tracepoints; + if (u_trace_submission_data->cmd_trace_data[i].trace) + u_trace_flush(u_trace_submission_data->cmd_trace_data[i].trace, + u_trace_submission_data, queue->device->vk.current_frame, + free_data); + + if (!u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) { + /* u_trace is owned by cmd_buffer */ + u_trace_submission_data->cmd_trace_data[i].trace = NULL; + } + } + } + + device->submit_count++; + + pthread_mutex_unlock(&device->submit_mutex); + pthread_cond_broadcast(&queue->device->timeline_cond); + + u_trace_context_process(&device->trace_context, false); + +out: + tu_submit_finish(device, submit); + +fail_create_submit: + if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers) + vk_free(&queue->device->vk.alloc, cmd_buffers); + + return result; +} 
+ VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, @@ -77,7 +198,7 @@ tu_queue_init(struct tu_device *device, queue->device = device; queue->priority = priority; - queue->vk.driver_submit = tu_queue_submit; + queue->vk.driver_submit = queue_submit; int ret = tu_drm_submitqueue_new(device, priority, &queue->msm_queue_id); if (ret)
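
For reference, after this change each kernel backend (msm, virtio and kgsl) provides the same four hooks, and they are driven from the new common queue_submit() in tu_queue.cc. Below is a minimal sketch of how the entry points declared in tu_knl.h compose; it is illustrative only and mirrors queue_submit() above. `device`, `queue`, `entries`, `num_entries`, `waits`/`wait_count`, `signals`/`signal_count` and `u_trace_data` are placeholder variables, and error handling, submit_mutex locking and the perf-pass/autotune/u_trace entries shown in the patch are elided:

   /* Illustrative sketch of the new tu_knl submit interface. */
   void *submit = tu_submit_create(device);            /* backend allocates its per-submit container   */

   tu_submit_add_entries(device, submit, entries,      /* command-stream IBs, appended in submission   */
                         num_entries);                 /* order; may be called repeatedly              */

   VkResult result = tu_queue_submit(queue, submit,    /* backend translates the accumulated entries   */
                                     waits, wait_count,/* into its own ioctl and submits them together */
                                     signals, signal_count,
                                     u_trace_data);    /* NULL when there are no trace points          */

   tu_submit_finish(device, submit);                   /* releases the backend's per-submit storage    */

The backends only see flat tu_cs_entry arrays plus the wait/signal syncobjs, while command-buffer iteration, dynamic cmdbuf insertion, perf-pass and autotune entries, u_trace bookkeeping and the timeline_cond broadcast all live in the shared queue_submit() path.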