mesa/src/broadcom/vulkan/v3dv_queue.c

2089 lines
70 KiB
C
Raw Normal View History

/*
* Copyright © 2019 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"
#include "broadcom/clif/clif_dump.h"
#include <errno.h>
#include <time.h>
/* Dumps a CL submission to stderr through the CLIF dumper.
 *
 * This is a no-op unless at least one of the CL, CL_NO_BIN or CLIF bits is
 * set in V3D_DEBUG. Every BO tracked by the job is mapped and registered with
 * the dumper before the submission itself is dumped. If mapping any BO fails,
 * an error is printed and the dump is skipped.
 */
static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL |
                               V3D_DEBUG_CL_NO_BIN |
                               V3D_DEBUG_CLIF)))) {
      return;
   }

   struct clif_dump *clif =
      clif_dump_init(&device->devinfo, stderr,
                     V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CL_NO_BIN),
                     V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);

   /* Register every BO of the job; stop at the first one we cannot map. */
   bool all_mapped = true;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *label = ralloc_asprintf(NULL, "%s_0x%x", bo->name, bo->offset);

      all_mapped = v3dv_bo_map(device, bo, bo->size);
      if (all_mapped)
         clif_dump_add_bo(clif, label, bo->offset, bo->size, bo->map);
      else
         fprintf(stderr, "failed to map BO for clif_dump.\n");

      ralloc_free(label);
      if (!all_mapped)
         break;
   }

   if (all_mapped)
      clif_dump(clif, submit);

   clif_dump_destroy(clif);
}
/* Returns the current CLOCK_MONOTONIC time expressed in nanoseconds. */
static uint64_t
gettime_ns(void)
{
   struct timespec current;
   clock_gettime(CLOCK_MONOTONIC, &current);
   return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
}
/* Converts a relative timeout (in nanoseconds) into an absolute
 * CLOCK_MONOTONIC deadline.
 *
 * The relative timeout is clamped so that the resulting deadline never
 * exceeds INT64_MAX.
 */
static uint64_t
get_absolute_timeout(uint64_t timeout)
{
   const uint64_t now = gettime_ns();
   const uint64_t headroom = (uint64_t) INT64_MAX - now;

   if (headroom < timeout)
      timeout = headroom;

   return now + timeout;
}
/* Forward declaration so that event_wait_thread_func() can submit the jobs
 * that follow a wait job in a command buffer.
 */
static VkResult
queue_submit_job(struct v3dv_queue *queue,
struct v3dv_job *job,
struct v3dv_submit_info_semaphores *sems_info,
pthread_t *wait_thread);
/* Blocks until every CPU wait thread that was spawned before the calling
 * thread has finished (and therefore submitted all of its GPU jobs).
 *
 * The queue's wait list is scanned under the queue mutex. If an unfinished
 * thread other than the caller is found, the mutex is released, we sleep for
 * 0.5 ms and scan again from the start.
 */
static void
cpu_queue_wait_idle(struct v3dv_queue *queue)
{
   const pthread_t this_thread = pthread_self();
   bool must_retry;

   do {
      bool scan_done = false;
      must_retry = false;

      mtx_lock(&queue->mutex);
      list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                          &queue->submit_wait_list, list_link) {
         for (uint32_t i = 0; i < info->wait_thread_count; i++) {
            if (info->wait_threads[i].finished)
               continue;

            /* The first unfinished entry decides the outcome of this scan.
             * The calling thread only shows up in the list when it is a wait
             * thread itself: in that case everything spawned before it is
             * done and we stop. When called from the main thread nothing
             * ever matches, so we end up waiting for all active wait
             * threads.
             */
            must_retry = info->wait_threads[i].thread != this_thread;
            scan_done = true;
            break;
         }
         if (scan_done)
            break;
      }
      mtx_unlock(&queue->mutex);

      if (must_retry)
         usleep(500); /* 0.5 ms */
   } while (must_retry);
}
static VkResult
gpu_queue_wait_idle(struct v3dv_queue *queue)
{
struct v3dv_device *device = queue->device;
int render_fd = device->pdevice->render_fd;
struct v3dv_last_job_sync last_job_syncs;
mtx_lock(&device->mutex);
memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
mtx_unlock(&device->mutex);
if (device->pdevice->caps.multisync) {
int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs,
3, INT64_MAX,
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
if (ret)
return VK_ERROR_DEVICE_LOST;
} else {
int ret =
drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
INT64_MAX, 0, NULL);
if (ret)
return VK_ERROR_DEVICE_LOST;
}
return VK_SUCCESS;
}
/* Waits for the queue to go idle: first the CPU wait threads (which may
 * still spawn new GPU jobs), then any GPU jobs in flight.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueWaitIdle(VkQueue _queue)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   /* CPU side first: wait threads can still submit GPU work. */
   cpu_queue_wait_idle(queue);

   /* Then make sure nothing is left running on the GPU. */
   const VkResult result = gpu_queue_wait_idle(queue);
   return result;
}
static VkResult
handle_reset_query_cpu_job(struct v3dv_job *job)
{
struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
assert(info->pool);
/* We are about to reset query counters so we need to make sure that
* The GPU is not using them. The exception is timestamp queries, since
* we handle those in the CPU.
*
* FIXME: we could avoid blocking the main thread for this if we use
* submission thread.
*/
v3dv: don't use a dedicated BO for each occlusion query Dedicated BOs waste memory and are also a significant cause of CPU overhead when applications use hundreds of them per frame due to all the work the kernel has to do to page in all these BOs for a job. The UE4 Vehicle demo was hitting this causing it to freeze and stutter under 1fps. The hardware allows us to setup groups of 16 queries in consecutive 4-byte addresses, requiring only that each group of 16 queries is aligned to a 1024 byte boundary. With this change, we allocate all the queries in a pool in a single BO and we assign them different offsets based on the above restriction. This eliminates the freezes and stutters in the Vehicle sample. One caveat of this solution is that we can only wait or test for completion of a query by testing if the GPU is still using its BO, which basically means that we can only wait for all active queries in a pool to complete and not just the ones being requested by the API. Since the Vulkan recommendation is to use a different query pool per frame this should not be a big issue though. If this ever becomes a problem (for example if an application does't follow the recommendation and instead allocates a single pool and splits its queries between frames), we could try to group queries in a pool into a number of BOs to try and find a balance, but for now this should work fine in most cases. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10253>
2021-04-14 13:34:00 +02:00
if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
return VK_SUCCESS;
}
/* Executes a CPU "end query" job: flags info->count consecutive queries,
 * starting at info->query, as possibly available. Always returns VK_SUCCESS.
 */
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job)
{
   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;

   uint32_t i = 0;
   while (i < info->count) {
      assert(info->query + i < info->pool->query_count);
      info->pool->queries[info->query + i].maybe_available = true;
      i++;
   }

   return VK_SUCCESS;
}
/* Executes a CPU job that copies query pool results into a buffer.
 *
 * The destination buffer's BO is mapped in full if it is not mapped yet, and
 * the results are written at info->offset from the start of the buffer.
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the BO cannot be mapped.
 */
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_job *job)
{
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *dst_bo = info->dst->mem->bo;

   /* The CPU copy needs the whole destination BO mapped. */
   assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
   if (!dst_bo->map) {
      if (!v3dv_bo_map(job->device, dst_bo, dst_bo->size))
         return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a
    * sync wait on the CPU for the corresponding GPU jobs to finish. We might
    * want to use a submission thread to avoid blocking on the main thread.
    */
   uint8_t *dst_ptr = (uint8_t *) dst_bo->map;
   dst_ptr += info->offset + info->dst->mem_offset;

   v3dv_get_query_pool_results_cpu(job->device, info->pool,
                                   info->first, info->count,
                                   dst_ptr, info->stride, info->flags);

   return VK_SUCCESS;
}
/* Executes a CPU "set event" job.
 *
 * From the Vulkan 1.0 spec:
 *
 *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
 *     dependency on commands that were submitted before it, and defines an
 *     event signal operation which sets the event to the signaled state.
 *     The first synchronization scope includes every command previously
 *     submitted to the same queue, including those in the same command
 *     buffer and batch".
 *
 * So all prior work, including active CPU wait threads spawned for command
 * buffers submitted *before* this one, must complete before the event state
 * is written. Returns the GPU wait error, if any, without touching the event.
 *
 * FIXME: we could avoid blocking the main thread for this if we use a
 * submission thread.
 */
static VkResult
handle_set_event_cpu_job(struct v3dv_job *job)
{
   struct v3dv_queue *queue = &job->device->queue;

   /* When called from a wait thread this only waits for wait threads spawned
    * before it; otherwise it waits for all active wait threads.
    */
   cpu_queue_wait_idle(queue);

   const VkResult wait_result = gpu_queue_wait_idle(queue);
   if (wait_result == VK_SUCCESS) {
      struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
      p_atomic_set(&info->event->state, info->state);
   }

   return wait_result;
}
/* Duplicates an array of semaphore handles.
 *
 * On success *sems_dst points to a newly allocated copy of sems_src (or is
 * NULL when sems_src_count is 0) and *sems_dst_count holds the number of
 * entries. The caller owns the copy and must release it with vk_free().
 *
 * On allocation failure *sems_dst is NULL, *sems_dst_count is 0 and
 * VK_ERROR_OUT_OF_HOST_MEMORY is returned.
 */
static VkResult
copy_semaphores(struct v3dv_device *device,
                VkSemaphore *sems_src, uint32_t sems_src_count,
                VkSemaphore **sems_dst, uint32_t *sems_dst_count)
{
   *sems_dst_count = sems_src_count;

   if (*sems_dst_count == 0) {
      *sems_dst = NULL;
      return VK_SUCCESS;
   }

   *sems_dst = vk_alloc(&device->vk.alloc,
                        *sems_dst_count * sizeof(VkSemaphore), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   /* Check the allocation itself, not the out-parameter (which is never
    * NULL), so that we never memcpy into a NULL destination on OOM.
    */
   if (!*sems_dst) {
      *sems_dst_count = 0;
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   memcpy(*sems_dst, sems_src, *sems_dst_count * sizeof(VkSemaphore));

   return VK_SUCCESS;
}
/* Creates a heap copy of a submit's wait and signal semaphore lists.
 *
 * Returns NULL if any allocation fails; anything allocated up to that point
 * is released before returning.
 */
static struct v3dv_submit_info_semaphores *
copy_semaphores_info(struct v3dv_device *device,
                     struct v3dv_submit_info_semaphores *info)
{
   struct v3dv_submit_info_semaphores *dup =
      vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_submit_info_semaphores),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (dup == NULL)
      return NULL;

   VkResult result =
      copy_semaphores(device, info->wait_sems, info->wait_sem_count,
                      &dup->wait_sems, &dup->wait_sem_count);
   if (result == VK_SUCCESS) {
      result =
         copy_semaphores(device, info->signal_sems, info->signal_sem_count,
                         &dup->signal_sems, &dup->signal_sem_count);
      if (result == VK_SUCCESS)
         return dup;
   }

   /* Failure: drop the wait list if we got as far as copying it. */
   if (dup->wait_sem_count > 0)
      vk_free(&device->vk.alloc, dup->wait_sems);
   vk_free(&device->vk.alloc, dup);

   return NULL;
}
static struct v3dv_wait_thread_info *
create_wait_thread_info(struct v3dv_job *job,
struct v3dv_submit_info_semaphores *sems_info)
{
struct v3dv_wait_thread_info *info =
vk_alloc(&job->device->vk.alloc, sizeof(*info), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!info)
return NULL;
info->job = job;
info->sems_info = copy_semaphores_info(job->device, sems_info);
if (!info->sems_info) {
vk_free(&job->device->vk.alloc, info);
return NULL;
}
return info;
}
static void
free_wait_thread_info(struct v3dv_device *device,
struct v3dv_wait_thread_info *info)
{
assert(info != NULL);
if (info->sems_info->wait_sem_count > 0)
vk_free(&device->vk.alloc, info->sems_info->wait_sems);
if (info->sems_info->signal_sem_count > 0)
vk_free(&device->vk.alloc, info->sems_info->signal_sems);
vk_free(&device->vk.alloc, info->sems_info);
vk_free(&device->vk.alloc, info);
}
/* Returns true when every event the wait job depends on currently reads as
 * set (non-zero state), false as soon as one does not.
 */
static bool
check_wait_events_complete(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   struct v3dv_event_wait_cpu_job_info *wait_info = &job->cpu.event_wait;

   uint32_t idx = 0;
   while (idx < wait_info->event_count) {
      if (!p_atomic_read(&wait_info->events[idx]->state))
         return false;
      idx++;
   }

   return true;
}
/* Marks the given wait thread as finished in the queue's wait list.
 *
 * The list is searched under the queue mutex; the thread is expected to be
 * registered there, so not finding it is treated as unreachable.
 */
static void
wait_thread_finish(struct v3dv_queue *queue, pthread_t thread)
{
   bool found = false;

   mtx_lock(&queue->mutex);

   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                       &queue->submit_wait_list, list_link) {
      for (uint32_t i = 0; i < info->wait_thread_count && !found; i++) {
         if (info->wait_threads[i].thread != thread)
            continue;

         info->wait_threads[i].finished = true;
         found = true;
      }

      if (found)
         break;
   }

   if (!found) {
      unreachable(!"Failed to finish wait thread: not found");
   }

   mtx_unlock(&queue->mutex);
}
/* Entry point of an event wait thread.
 *
 * Polls (1 ms interval) until all events of the wait job are set, then
 * submits the remaining jobs of the same command buffer. Before returning it
 * marks itself finished in the queue's wait list and frees its argument
 * block. Always returns NULL.
 */
static void *
event_wait_thread_func(void *_info)
{
   struct v3dv_wait_thread_info *info = _info;
   struct v3dv_job *wait_job = info->job;
   assert(wait_job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   const useconds_t poll_interval_us = 1000;

   /* Block until the events this job waits on have been signaled. */
   while (!check_wait_events_complete(wait_job))
      usleep(poll_interval_us);

   /* Resume submission with the jobs that come after the wait job. */
   struct v3dv_queue *queue = &wait_job->device->queue;
   list_for_each_entry_from(struct v3dv_job, pjob, wait_job->list_link.next,
                            &wait_job->cmd_buffer->jobs, list_link) {
      /* Semaphores are never signaled from a wait thread: job completion
       * order is no longer guaranteed here (e.g. if the wait is in the first
       * command buffer of a batch, the last job of the last command buffer
       * can't be the one to signal). Signaling always happens on the master
       * thread, once all jobs of all command buffers have been submitted.
       */
      pjob->do_sem_signal = false;

      /* Passing a NULL wait_thread keeps it to a single wait thread per
       * command buffer: further event waits are resolved right here.
       */
      VkResult result = queue_submit_job(queue, pjob, info->sems_info, NULL);
      if (result == VK_NOT_READY) {
         while (!check_wait_events_complete(pjob))
            usleep(poll_interval_us);
         continue;
      }

      if (result != VK_SUCCESS) {
         fprintf(stderr, "Wait thread job execution failed.\n");
         break;
      }
   }

   wait_thread_finish(queue, pthread_self());
   free_wait_thread_info(wait_job->device, info);

   return NULL;
}
/* Starts a thread running event_wait_thread_func() with the given argument
 * block and stores its id in *wait_thread.
 *
 * Returns VK_NOT_READY when the thread was created, or VK_ERROR_DEVICE_LOST
 * if pthread_create() failed.
 */
static VkResult
spawn_event_wait_thread(struct v3dv_wait_thread_info *info, pthread_t *wait_thread)
{
   assert(info->job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(info->job->cmd_buffer);
   assert(wait_thread != NULL);

   const int err =
      pthread_create(wait_thread, NULL, event_wait_thread_func, info);
   if (err != 0)
      return vk_error(info->job->device, VK_ERROR_DEVICE_LOST);

   return VK_NOT_READY;
}
/* Executes a CPU "wait events" job.
 *
 * Returns:
 *  - VK_SUCCESS if all events are already set, so submission of the command
 *    buffer can continue normally.
 *  - VK_NOT_READY if the events are not set yet. The caller must stop
 *    submitting jobs from this command buffer; when wait_thread is non-NULL
 *    a wait thread has been spawned to submit them after the wait completes.
 *  - An error code if the wait thread could not be set up.
 */
static VkResult
handle_wait_events_cpu_job(struct v3dv_job *job,
                           struct v3dv_submit_info_semaphores *sems_info,
                           pthread_t *wait_thread)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   /* If all events are signaled then we are done and can continue submitting
    * the rest of the command buffer normally.
    */
   if (check_wait_events_complete(job))
      return VK_SUCCESS;

   /* Otherwise, we put the rest of the command buffer on a wait thread until
    * all events are signaled. We only spawn a new thread on the first
    * wait job we see for a command buffer, any additional wait jobs in the
    * same command buffer will run in that same wait thread and will get here
    * with a NULL wait_thread pointer.
    *
    * Also, whether we spawn a wait thread or not, we always return
    * VK_NOT_READY (unless an error happened), so we stop trying to submit
    * any jobs in the same command buffer after the wait job. The wait thread
    * will attempt to submit them after the wait completes.
    */
   if (!wait_thread)
      return VK_NOT_READY;

   /* As events can be signaled by the host, jobs after the event wait must
    * still wait for semaphores, if any. So, whenever we spawn a wait thread,
    * we keep a copy of the semaphores (info->sems_info) to be used when
    * submitting pending jobs in the wait thread context.
    */
   struct v3dv_wait_thread_info *info =
      create_wait_thread_info(job, sems_info);
   if (!info)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result = spawn_event_wait_thread(info, wait_thread);

   /* The wait thread frees 'info' when it exits. If it was never created,
    * ownership stays here and we must release it to avoid a leak.
    */
   if (result != VK_NOT_READY)
      free_wait_thread_info(job->device, info);

   return result;
}
static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
{
assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
struct v3dv_copy_buffer_to_image_cpu_job_info *info =
&job->cpu.copy_buffer_to_image;
/* Wait for all GPU work to finish first, since we may be accessing
* the BOs involved in the operation.
*/
v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
/* Map BOs */
struct v3dv_bo *dst_bo = info->image->mem->bo;
assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
void *dst_ptr = dst_bo->map;
struct v3dv_bo *src_bo = info->buffer->mem->bo;
assert(!src_bo->map || src_bo->map_size == src_bo->size);
if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
void *src_ptr = src_bo->map;
const struct v3d_resource_slice *slice =
&info->image->slices[info->mip_level];
const struct pipe_box box = {
info->image_offset.x, info->image_offset.y, info->base_layer,
info->image_extent.width, info->image_extent.height, info->layer_count,
};
/* Copy each layer */
for (uint32_t i = 0; i < info->layer_count; i++) {
const uint32_t dst_offset =
v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
const uint32_t src_offset =
info->buffer->mem_offset + info->buffer_offset +
info->buffer_layer_stride * i;
v3d_store_tiled_image(
dst_ptr + dst_offset, slice->stride,
src_ptr + src_offset, info->buffer_stride,
slice->tiling, info->image->cpp, slice->padded_height, &box);
}
return VK_SUCCESS;
}
/* Executes a CPU timestamp query.
 *
 * After the queue has gone idle, a CLOCK_MONOTONIC timestamp (in
 * nanoseconds) is stored in the first query of the range; all info->count
 * queries in the range are flagged as possibly available.
 */
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   /* Everything queued before the timestamp must have completed. */
   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

   /* Sample the clock once, up front. */
   struct timespec now;
   clock_gettime(CLOCK_MONOTONIC, &now);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      query->maybe_available = true;

      /* Only the first query of the range receives the value. */
      if (i != 0)
         continue;
      query->value = now.tv_sec * 1000000000ull + now.tv_nsec;
   }

   return VK_SUCCESS;
}
/* Executes the CPU part of an indirect compute dispatch.
 *
 * Waits for the GPU to be done with the indirect buffer, maps it and reads
 * the three workgroup counts. If any count is zero nothing else is done.
 * If the counts differ from the ones recorded in the CSD job, the CSD job
 * is rewritten with the new counts.
 */
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_info_semaphores *sems_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *indirect_bo = info->buffer->mem->bo;

   /* The GPU must not be using the indirect buffer any more. */
   v3dv_bo_wait(queue->device, indirect_bo, PIPE_TIMEOUT_INFINITE);

   /* Map the buffer so the dispatch parameters can be read. */
   if (!indirect_bo->map) {
      if (!v3dv_bo_map(job->device, indirect_bo, indirect_bo->size))
         return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }
   assert(indirect_bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts =
      (uint32_t *) ((uint8_t *) indirect_bo->map + offset);

   const bool has_work =
      group_counts[0] != 0 && group_counts[1] != 0 && group_counts[2] != 0;
   if (!has_work)
      return VK_SUCCESS;

   const bool counts_match =
      memcmp(group_counts, info->csd_job->csd.wg_count,
             sizeof(info->csd_job->csd.wg_count)) == 0;
   if (!counts_match)
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);

   return VK_SUCCESS;
}
/* Returns the syncobj handle to use for a semaphore: the temporary one when
 * has_temp is set, the permanent one otherwise.
 */
static uint32_t
semaphore_get_sync(struct v3dv_semaphore *sem)
{
   if (sem->has_temp) {
      assert(sem->temp_sync > 0);
      return sem->temp_sync;
   }

   return sem->sync;
}
/* Returns the syncobj handle to use for a fence: the temporary one when
 * has_temp is set, the permanent one otherwise.
 */
static uint32_t
fence_get_sync(struct v3dv_fence *fence)
{
   if (fence->has_temp) {
      assert(fence->temp_sync > 0);
      return fence->temp_sync;
   }

   return fence->sync;
}
static VkResult
process_semaphores_to_signal(struct v3dv_device *device,
uint32_t count, const VkSemaphore *sems,
bool is_master_thread)
{
if (count == 0)
return VK_SUCCESS;
/* If multisync is supported, we are signalling semaphores in the last job
* of the last command buffer and, therefore, we do not need to process any
* semaphores here, unless we come from a wait thread, because in that case
* we never signal.
*/
if (device->pdevice->caps.multisync && !is_master_thread)
return VK_SUCCESS;
v3dv: move authenticated display fd acquisition to swapchain creation time So far, we have only been supporting X11, so we assumed that we were running inside X11 and would always try to get an authenticated fd from Xorg during device initialization. While this works for desktop Raspbian, it is not really correct and it is not what we want to do when we start considering other WSIs. Initially, one could think we can still do this by guarding the WSI code under the proper instance extension check. This, however, doesn't work reliably, as the Vulkan loader can call vkEnumerateDevices without enabling surface extensions on the instance, which then can lead to us not initializing any display_fd and failing with VK_ERROR_INITIALIZATION_FAILED, which is not correct, so while we can try to acquire the display_fd here, it might not always work, and we should definitely not fail initialization of the physical device for that. Instead, with this change we move acquisition of display_fd to swapchain creation time where required extensions need to be enabled in the instance. This was also suggested by Daniel Stone during review of a work-in-progress implementation for the Wayland WSI. There is a special case to consider though: applications like Zink that don't use Vulkan's swapchains at all but still allocate images that they intend to use for WSI. We need to handle these by checking that we have indeed acquired a display_fd before doing any memory allocation for WSI, and acquiring one at that time if that's not the case. This change also removes the render_fd and display_fd fields from the logical device (which we were copying from the physical device), because now there is no guarantee that we have acquired a display_fd at the time we create a logical device. Instead, we now put a reference to the physical device on the logical device from which we can access these. 
Finally, this also fixes a regression introduced with VK_KHR_display, where if that extension is enabled but we are running inside a compositor, we would acquire a display_fd that is not authenticated and try to use that instead of acquiring an authenticated display_fd from the display server. Fixes: b1188c9451 (v3dv: VK_KHR_display extension support) Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7546>
2020-11-11 09:45:33 +01:00
int render_fd = device->pdevice->render_fd;
int fd;
mtx_lock(&device->mutex);
drmSyncobjExportSyncFile(render_fd,
device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
&fd);
mtx_unlock(&device->mutex);
if (fd == -1)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
VkResult result = VK_SUCCESS;
for (uint32_t i = 0; i < count; i++) {
struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
uint32_t sync = semaphore_get_sync(sem);
int ret = drmSyncobjImportSyncFile(render_fd, sync, fd);
if (ret) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
break;
}
}
assert(fd >= 0);
close(fd);
return result;
}
/* Forward declaration so that process_fence_to_signal() can emit a noop job
 * to signal a fence.
 */
static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
struct v3dv_submit_info_semaphores *sems_info,
bool do_sem_signal, bool serialize);
/* Makes the given fence signal when all submitted work completes.
 *
 * Does nothing for VK_NULL_HANDLE. With multisync support a serialized noop
 * job that signals the fence is submitted. Otherwise the V3DV_QUEUE_ANY
 * last-job syncobj is exported as a sync file and imported into the fence's
 * syncobj; VK_ERROR_OUT_OF_HOST_MEMORY is returned if that fails.
 */
static VkResult
process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
{
   if (_fence == VK_NULL_HANDLE)
      return VK_SUCCESS;

   struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);

   int render_fd = device->pdevice->render_fd;

   if (device->pdevice->caps.multisync) {
      struct v3dv_queue *queue = &device->queue;
      /* We signal the fence once all submitted command buffers have completed
       * execution. For this, we emit a noop job that waits on the completion
       * of all submitted jobs and signal the fence for this submission.
       * FIXME: In simpler cases (for instance, when all jobs were submitted to
       * the same queue), we can just import the last out sync produced into
       * the fence.
       */
      struct v3dv_submit_info_semaphores sems_info = {
         .wait_sem_count = 0,
         .wait_sems = NULL,
         .signal_sem_count = 0,
         .signal_sems = NULL,
         .fence = _fence,
      };

      return queue_submit_noop_job(queue, &sems_info, false, true);
   }

   /* drmSyncobjExportSyncFile() leaves 'fd' untouched when it fails, so it
    * must start out invalid for the check below to be meaningful.
    */
   int fd = -1;
   mtx_lock(&device->mutex);
   drmSyncobjExportSyncFile(render_fd,
                            device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   mtx_unlock(&device->mutex);
   if (fd == -1)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t sync = fence_get_sync(fence);
   int ret = drmSyncobjImportSyncFile(render_fd, sync, fd);

   assert(fd >= 0);
   close(fd);

   return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
}
/* Frees the in/out syncobj arrays referenced by a multisync extension. The
 * arrays are stored in the struct as integer-encoded pointers.
 */
static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   void *out_syncs = (void *)(uintptr_t)ms->out_syncs;
   void *in_syncs = (void *)(uintptr_t)ms->in_syncs;

   vk_free(&device->vk.alloc, out_syncs);
   vk_free(&device->vk.alloc, in_syncs);
}
/* Builds the array of syncobjs a job has to wait on before running.
 *
 * *count receives the number of entries:
 *  - the number of wait semaphores, if this is the first job submitted to
 *    'queue' in the batch and there are any;
 *  - otherwise 3 (the tracked last-job syncobjs) if the job is flagged for
 *    serialization;
 *  - otherwise 0.
 *
 * Returns NULL when *count is 0 or when the allocation fails (in which case
 * *count is non-zero). The caller owns the returned array.
 */
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_device *device,
             struct v3dv_job *job,
             enum v3dv_queue_type queue,
             uint32_t *count,
             struct v3dv_submit_info_semaphores *sems_info)
{
   /* Only the first job submitted to a given GPU queue in this cmd buf batch
    * has to wait on the wait semaphores (if any).
    */
   uint32_t wait_sem_count = 0;
   if (device->last_job_syncs.first[queue])
      wait_sem_count = sems_info->wait_sem_count;

   /* Without wait semaphores, a serialized job waits for completion of all
    * GPU jobs submitted to any of the V3DV_QUEUE_(CL/TFU/CSD) queues.
    */
   if (wait_sem_count > 0)
      *count = wait_sem_count;
   else
      *count = job->serialize ? 3 : 0;

   if (*count == 0)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (syncs == NULL)
      return NULL;

   if (wait_sem_count == 0) {
      for (uint32_t i = 0; i < *count; i++)
         syncs[i].handle = device->last_job_syncs.syncs[i];
      return syncs;
   }

   for (uint32_t i = 0; i < *count; i++) {
      struct v3dv_semaphore *sem =
         v3dv_semaphore_from_handle(sems_info->wait_sems[i]);
      syncs[i].handle = semaphore_get_sync(sem);

      /* From the Vulkan 1.0 spec:
       *
       *    "If the import is temporary, the implementation must restore
       *     the semaphore to its prior permanent state after submitting
       *     the next semaphore wait operation."
       *
       * The temporary sync can't be destroyed until the kernel is done with
       * it, which is why a 'has_temp' flag is used instead of checking
       * temp_sync for 0. The temporary sync will be destroyed if we import
       * into the semaphore again or if the client destroys the semaphore.
       */
      sem->has_temp = false;
   }

   return syncs;
}
/* Builds the array of syncobjs a job signals on completion.
 *
 * Layout of the array: the signal semaphores (only if the job is flagged
 * with do_sem_signal), then the last-job syncobj tracked for 'queue', then
 * the fence syncobj if the submit carries a fence. *count receives the number
 * of entries, which is always at least 1.
 *
 * Returns NULL if the allocation fails. The caller owns the returned array.
 */
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue,
              uint32_t *count,
              struct v3dv_submit_info_semaphores *sems_info)
{
   uint32_t signal_count = 0;
   if (job->do_sem_signal)
      signal_count = sems_info->signal_sem_count;

   /* One extra slot is always needed for the `device->last_job_syncs`
    * syncobj of this queue type, which tracks the last job submitted to it.
    */
   *count = signal_count + 1;
   if (sems_info->fence)
      *count += 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (syncs == NULL)
      return NULL;

   uint32_t slot = 0;
   while (slot < signal_count) {
      struct v3dv_semaphore *sem =
         v3dv_semaphore_from_handle(sems_info->signal_sems[slot]);
      syncs[slot].handle = semaphore_get_sync(sem);
      slot++;
   }

   syncs[slot].handle = device->last_job_syncs.syncs[queue];
   slot++;

   if (sems_info->fence) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(sems_info->fence);
      syncs[slot].handle = fence_get_sync(fence);
   }

   return syncs;
}
/* Fills in a DRM extension header: its id, its flags and the link to the
 * next extension (stored as an integer-encoded pointer).
 */
static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   const uintptr_t next_addr = (uintptr_t)(void *)next;

   ext->id = id;
   ext->flags = flags;
   ext->next = next_addr;
}
/* Fills in the multisync extension with the in/out syncobjs for a job.
 *
 * On success the extension id is set to DRM_V3D_EXT_ID_MULTI_SYNC and the
 * queue is no longer flagged as waiting for its first job of the batch. If
 * allocating either array fails, 'ms' is left untouched, so an extension id
 * that is still 0 tells the caller that we ran out of memory.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_info_semaphores *sems_info,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              enum v3d_queue wait_stage)
{
   uint32_t in_sync_count = 0;
   struct drm_v3d_sem *in_syncs =
      set_in_syncs(device, job, queue_sync, &in_sync_count, sems_info);

   /* A NULL array is only an error if entries were actually requested. */
   if (in_syncs == NULL && in_sync_count != 0)
      return;

   uint32_t out_sync_count = 0;
   struct drm_v3d_sem *out_syncs =
      set_out_syncs(device, job, queue_sync, &out_sync_count, sems_info);
   assert(out_sync_count > 0);

   if (out_syncs == NULL) {
      if (in_syncs)
         vk_free(&device->vk.alloc, in_syncs);
      return;
   }

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   device->last_job_syncs.first[queue_sync] = false;
}
static VkResult
handle_cl_job(struct v3dv_queue *queue,
struct v3dv_job *job,
struct v3dv_submit_info_semaphores *sems_info)
{
struct v3dv_device *device = queue->device;
struct drm_v3d_submit_cl submit = { 0 };
/* Sanity check: we should only flag a bcl sync on a job that needs to be
* serialized.
*/
assert(job->serialize || !job->needs_bcl_sync);
/* We expect to have just one RCL per job which should fit in just one BO.
* Our BCL, could chain multiple BOS together though.
*/
assert(list_length(&job->rcl.bo_list) == 1);
assert(list_length(&job->bcl.bo_list) >= 1);
struct v3dv_bo *bcl_fist_bo =
list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
submit.bcl_start = bcl_fist_bo->offset;
submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
submit.rcl_start = job->rcl.bo->offset;
submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
submit.qma = job->tile_alloc->offset;
submit.qms = job->tile_alloc->size;
submit.qts = job->tile_state->offset;
submit.flags = 0;
if (job->tmu_dirty_rcl)
submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
submit.bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
uint32_t bo_idx = 0;
set_foreach(job->bos, entry) {
struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
bo_handles[bo_idx++] = bo->handle;
}
assert(bo_idx == submit.bo_handle_count);
submit.bo_handles = (uintptr_t)(void *)bo_handles;
/* We need a binning sync if we are waiting on a semaphore or if the job
* comes after a pipeline barrier that involves geometry stages
* (needs_bcl_sync).
*
* We need a render sync if the job doesn't need a binning sync but has
* still been flagged for serialization. It should be noted that RCL jobs
* don't start until the previous RCL job has finished so we don't really
* need to add a fence for those, however, we might need to wait on a CSD or
* TFU job, which are not automatically serialized with CL jobs.
*
* FIXME: see if we can do better and avoid bcl syncs for any jobs in the
* command buffer after the first job where we should be able to track bcl
* dependencies strictly through barriers.
*/
const bool needs_bcl_sync =
sems_info->wait_sem_count > 0 || job->needs_bcl_sync;
const bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
mtx_lock(&queue->device->mutex);
/* Replace single semaphore settings whenever our kernel-driver supports
* multiple semaphores extension.
*/
struct drm_v3d_multi_sync ms = { 0 };
if (device->pdevice->caps.multisync) {
enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
set_multisync(&ms, sems_info, NULL, device, job,
V3DV_QUEUE_CL, wait_stage);
if (!ms.base.id) {
mtx_unlock(&queue->device->mutex);
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
submit.extensions = (uintptr_t)(void *)&ms;
/* Disable legacy sync interface when multisync extension is used */
submit.in_sync_rcl = 0;
submit.in_sync_bcl = 0;
submit.out_sync = 0;
} else {
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
submit.out_sync = last_job_sync;
}
v3dv_clif_dump(device, job, &submit);
v3dv: move authenticated display fd acquisition to swapchain creation time So far, we have only been supporting X11, so we assumed that we were running inside X11 and would always try to get an authenticated fd from Xorg during device initialization. While this works for desktop Raspbian, it is not really correct and it is not what we want to do when we start considering other WSIs. Initially, one could think we can still do this by guarding the WSI code under the proper instance extension check. This, however, doesn't work reliably, as the Vulkan loader can call vkEnumerateDevices without enabling surface extensions on the instance, which then can lead to us not initializing any display_fd and failing with VK_ERROR_INITIALIZATION_FAILED, which is not correct, so while we can try to acquire the display_fd here, it might not always work, and we should definitely not fail initialization of the physical device for that. Instead, with this change we move acquisition of display_fd to swapchain creation time where required extensions need to be enabled in the instance. This was also suggested by Daniel Stone during review of a work-in-progress implementation for the Wayland WSI. There is a special case to consider though: applications like Zink that don't use Vulkan's swapchains at all but still allocate images that they intend to use for WSI. We need to handle these by checking that we have indeed acquired a display_fd before doing any memory allocation for WSI, and acquiring one at that time if that's not the case. This change also removes the render_fd and display_fd fields from the logical device (which we were copying from the physical device), because now there is no guarantee that we have acquired a display_fd at the time we create a logical device. Instead, we now put a reference to the physical device on the logical device from which we can access these. 
Finally, this also fixes a regression introduced with VK_KHR_display, where if that extension is enabled but we are running inside a compositor, we would acquire a display_fd that is not authenticated and try to use that instead of acquiring an authenticated display_fd from the display server. Fixes: b1188c9451 (v3dv: VK_KHR_display extension support) Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7546>
2020-11-11 09:45:33 +01:00
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CL, &submit);
mtx_unlock(&queue->device->mutex);
static bool warned = false;
if (ret && !warned) {
fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
strerror(errno));
warned = true;
}
free(bo_handles);
multisync_free(device, &ms);
if (ret)
return vk_error(device, VK_ERROR_DEVICE_LOST);
return VK_SUCCESS;
}
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
struct v3dv_job *job,
struct v3dv_submit_info_semaphores *sems_info)
{
struct v3dv_device *device = queue->device;
const bool needs_sync = sems_info->wait_sem_count || job->serialize;
mtx_lock(&device->mutex);
/* Replace single semaphore settings whenever our kernel-driver supports
* multiple semaphore extension.
*/
struct drm_v3d_multi_sync ms = { 0 };
if (device->pdevice->caps.multisync) {
set_multisync(&ms, sems_info, NULL, device, job,
V3DV_QUEUE_TFU, V3D_TFU);
if (!ms.base.id) {
mtx_unlock(&device->mutex);
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
job->tfu.extensions = (uintptr_t)(void *)&ms;
/* Disable legacy sync interface when multisync extension is used */
job->tfu.in_sync = 0;
job->tfu.out_sync = 0;
} else {
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
job->tfu.in_sync = needs_sync ? last_job_sync : 0;
job->tfu.out_sync = last_job_sync;
}
v3dv: move authenticated display fd acquisition to swapchain creation time So far, we have only been supporting X11, so we assumed that we were running inside X11 and would always try to get an authenticated fd from Xorg during device initialization. While this works for desktop Raspbian, it is not really correct and it is not what we want to do when we start considering other WSIs. Initially, one could think we can still do this by guarding the WSI code under the proper instance extension check. This, however, doesn't work reliably, as the Vulkan loader can call vkEnumerateDevices without enabling surface extensions on the instance, which then can lead to us not initializing any display_fd and failing with VK_ERROR_INITIALIZATION_FAILED, which is not correct, so while we can try to acquire the display_fd here, it might not always work, and we should definitely not fail initialization of the physical device for that. Instead, with this change we move acquisition of display_fd to swapchain creation time where required extensions need to be enabled in the instance. This was also suggested by Daniel Stone during review of a work-in-progress implementation for the Wayland WSI. There is a special case to consider though: applications like Zink that don't use Vulkan's swapchains at all but still allocate images that they intend to use for WSI. We need to handle these by checking that we have indeed acquired a display_fd before doing any memory allocation for WSI, and acquiring one at that time if that's not the case. This change also removes the render_fd and display_fd fields from the logical device (which we were copying from the physical device), because now there is no guarantee that we have acquired a display_fd at the time we create a logical device. Instead, we now put a reference to the physical device on the logical device from which we can access these. 
Finally, this also fixes a regression introduced with VK_KHR_display, where if that extension is enabled but we are running inside a compositor, we would acquire a display_fd that is not authenticated and try to use that instead of acquiring an authenticated display_fd from the display server. Fixes: b1188c9451 (v3dv: VK_KHR_display extension support) Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7546>
2020-11-11 09:45:33 +01:00
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
mtx_unlock(&device->mutex);
multisync_free(device, &ms);
if (ret != 0) {
fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
return vk_error(device, VK_ERROR_DEVICE_LOST);
}
return VK_SUCCESS;
}
static VkResult
handle_csd_job(struct v3dv_queue *queue,
struct v3dv_job *job,
struct v3dv_submit_info_semaphores *sems_info)
{
struct v3dv_device *device = queue->device;
struct drm_v3d_submit_csd *submit = &job->csd.submit;
submit->bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
uint32_t bo_idx = 0;
set_foreach(job->bos, entry) {
struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
bo_handles[bo_idx++] = bo->handle;
}
assert(bo_idx == submit->bo_handle_count);
submit->bo_handles = (uintptr_t)(void *)bo_handles;
const bool needs_sync = sems_info->wait_sem_count || job->serialize;
mtx_lock(&queue->device->mutex);
/* Replace single semaphore settings whenever our kernel-driver supports
* multiple semaphore extension.
*/
struct drm_v3d_multi_sync ms = { 0 };
if (device->pdevice->caps.multisync) {
set_multisync(&ms, sems_info, NULL, device, job,
V3DV_QUEUE_CSD, V3D_CSD);
if (!ms.base.id) {
mtx_unlock(&queue->device->mutex);
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
submit->extensions = (uintptr_t)(void *)&ms;
/* Disable legacy sync interface when multisync extension is used */
submit->in_sync = 0;
submit->out_sync = 0;
} else {
uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
submit->in_sync = needs_sync ? last_job_sync : 0;
submit->out_sync = last_job_sync;
}
v3dv: move authenticated display fd acquisition to swapchain creation time So far, we have only been supporting X11, so we assumed that we were running inside X11 and would always try to get an authenticated fd from Xorg during device initialization. While this works for desktop Raspbian, it is not really correct and it is not what we want to do when we start considering other WSIs. Initially, one could think we can still do this by guarding the WSI code under the proper instance extension check. This, however, doesn't work reliably, as the Vulkan loader can call vkEnumerateDevices without enabling surface extensions on the instance, which then can lead to us not initializing any display_fd and failing with VK_ERROR_INITIALIZATION_FAILED, which is not correct, so while we can try to acquire the display_fd here, it might not always work, and we should definitely not fail initialization of the physical device for that. Instead, with this change we move acquisition of display_fd to swapchain creation time where required extensions need to be enabled in the instance. This was also suggested by Daniel Stone during review of a work-in-progress implementation for the Wayland WSI. There is a special case to consider though: applications like Zink that don't use Vulkan's swapchains at all but still allocate images that they intend to use for WSI. We need to handle these by checking that we have indeed acquired a display_fd before doing any memory allocation for WSI, and acquiring one at that time if that's not the case. This change also removes the render_fd and display_fd fields from the logical device (which we were copying from the physical device), because now there is no guarantee that we have acquired a display_fd at the time we create a logical device. Instead, we now put a reference to the physical device on the logical device from which we can access these. 
Finally, this also fixes a regression introduced with VK_KHR_display, where if that extension is enabled but we are running inside a compositor, we would acquire a display_fd that is not authenticated and try to use that instead of acquiring an authenticated display_fd from the display server. Fixes: b1188c9451 (v3dv: VK_KHR_display extension support) Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7546>
2020-11-11 09:45:33 +01:00
int ret = v3dv_ioctl(device->pdevice->render_fd,
DRM_IOCTL_V3D_SUBMIT_CSD, submit);
mtx_unlock(&queue->device->mutex);
static bool warned = false;
if (ret && !warned) {
fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
strerror(errno));
warned = true;
}
free(bo_handles);
multisync_free(device, &ms);
if (ret)
return vk_error(device, VK_ERROR_DEVICE_LOST);
return VK_SUCCESS;
}
/* Dispatches a job to the handler that matches its type and returns the
 * handler's result.
 *
 * For a CPU job submitted while there are still wait semaphores pending, the
 * semaphores are consumed first (see below) and sems_info->wait_sem_count is
 * reset to 0.
 */
static VkResult
queue_submit_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 struct v3dv_submit_info_semaphores *sems_info,
                 pthread_t *wait_thread)
{
   assert(job);

   /* CPU jobs usually perform their own explicit waits before running (a
    * query reset CPU job waits for its queries to be unused, for example).
    * Wait semaphores still have to be honoured for the first CPU job we
    * process in the command buffer batch though. We do that by waiting for
    * idle: once all previous work has completed the wait semaphores must
    * have been signalled, and we never have to do it again for this batch.
    *
    * Semaphores imported from another instance/process are a corner case:
    * the Vulkan spec still requires that their signal operation has been
    * submitted before this wait, but our wait for idle only knows about the
    * last jobs sent from our own instance. So we first submit a noop job
    * that "consumes" the semaphores and only then wait for idle, which makes
    * the CPU job wait for the semaphores even when they are signalled from
    * elsewhere.
    *
    * Note that the result of the noop submission is not checked here.
    */
   const bool is_gpu_job = v3dv_job_type_is_gpu(job);
   if (!is_gpu_job && sems_info->wait_sem_count != 0) {
      queue_submit_noop_job(queue, sems_info, false, false);
      v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

#ifdef DEBUG
      /* Every wait semaphore is expected to be signalled at this point */
      const int render_fd = queue->device->pdevice->render_fd;
      for (int idx = 0; idx < sems_info->wait_sem_count; idx++) {
         struct v3dv_semaphore *wait_sem =
            v3dv_semaphore_from_handle(sems_info->wait_sems[idx]);
         uint32_t sync_handle = semaphore_get_sync(wait_sem);
         int wait_ret =
            drmSyncobjWait(render_fd, &sync_handle, 1, 0, 0, NULL);
         assert(wait_ret == 0);
      }
#endif

      sems_info->wait_sem_count = 0;
   }

   switch (job->type) {
   /* GPU jobs */
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, sems_info);

   /* CPU jobs */
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_SET_EVENT:
      return handle_set_event_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
      return handle_wait_events_cpu_job(job, sems_info, wait_thread);
   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
      return handle_copy_buffer_to_image_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sems_info);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(job);

   default:
      unreachable("Unhandled job type");
   }
}
static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
struct v3dv_device *device = queue->device;
queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!queue->noop_job)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
v3dv: start to move and wrap hw-version code with v3dv_queue The idea would be to move all the code that uses cl_emit, cl_emit_with_prepack, v3dx_pack, and any enum/structure definition defined on the v3d pack headers. All those methods would be defined on v3dvx_private (that would be the equivalent to v3dx_context.h on v3d). This commit includes the definition of v3dX for the current version supported (42), a function calling wrapper, and the move for v3dv_queue methods as a reference. About the function calling wrapper, I took the idea from anv. We don't have on v3d, but we added it because we foresee that we will need that functionality more often. So without that macro, in order to call the correct version of the method from the general code we would need to do like we do on v3d, and doing something like this: if (devinfo->ver >= 42) return v3d42_pack_sampler_state(sampler, pCreateInfo); else return v3d33_pack_sampler_state(sampler, pCreateInfo); So with the macro we can just do this: v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo). Note that as mentioned, that is to be used on the general code, so a runtime decision. If we are already on version-dependant code (so at v3dx_queue for example) we just use v3dX, as at that point is a build time decision. Also, fwiw, I don't like too much the name of that macro, but I was not able to think on a better one. v2: merge job_emit_noop_bin and job_emit_noop_render (Iago) Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11310>
2021-06-10 13:39:35 +02:00
v3dv_X(device, job_emit_noop)(queue->noop_job);
return VK_SUCCESS;
}
/* Submits the queue's no-op job, creating it on first use.
 *
 * The submission is skipped (returning VK_SUCCESS) when there is nothing to
 * signal, no serialization was requested and there are no wait semaphores.
 * Otherwise the result of creating or submitting the job is returned.
 */
static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      struct v3dv_submit_info_semaphores *sems_info,
                      bool do_sem_signal, bool serialize)
{
   const bool has_wait_sems = sems_info->wait_sem_count != 0;
   if (!(do_sem_signal || serialize || has_wait_sems))
      return VK_SUCCESS;

   /* noop_job needs protection against concurrent access: the client has to
    * externally synchronize queue submissions, but we may spawn threads that
    * submit noop jobs on their own.
    */
   VkResult result = VK_SUCCESS;
   mtx_lock(&queue->noop_mutex);

   if (queue->noop_job == NULL)
      result = queue_create_noop_job(queue);

   if (result == VK_SUCCESS) {
      queue->noop_job->do_sem_signal = do_sem_signal;
      queue->noop_job->serialize = serialize;
      result = queue_submit_job(queue, queue->noop_job, sems_info, NULL);
   }

   mtx_unlock(&queue->noop_mutex);
   return result;
}
/* This function takes a job type and returns True if we have
* previously submitted any jobs for the same command buffer batch
* to a queue different to the one for this job type.
*/
static bool
cmd_buffer_batch_is_multi_queue(struct v3dv_device *device,
enum v3dv_job_type job_type)
{
enum v3dv_queue_type queue_type = V3DV_QUEUE_ANY;
struct v3dv_last_job_sync last_job_syncs;
mtx_lock(&device->mutex);
memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
mtx_unlock(&device->mutex);
switch (job_type) {
case V3DV_JOB_TYPE_GPU_CL:
case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
queue_type = V3DV_QUEUE_CL;
break;
case V3DV_JOB_TYPE_GPU_TFU:
queue_type = V3DV_QUEUE_TFU;
break;
case V3DV_JOB_TYPE_GPU_CSD:
queue_type = V3DV_QUEUE_CSD;
break;
default:
unreachable("Queue type is undefined");
break;
}
for (int i = 0; i < V3DV_QUEUE_ANY; i++) {
if (i != queue_type && !last_job_syncs.first[i]) {
return true;
}
}
return false;
}
static VkResult
queue_submit_cmd_buffer(struct v3dv_queue *queue,
struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_submit_info_semaphores *sems_info,
bool is_last_cmd_buffer,
pthread_t *wait_thread)
{
struct v3dv_job *last;
bool do_sem_signal = is_last_cmd_buffer && sems_info->signal_sem_count > 0;
assert(cmd_buffer);
assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);
if (list_is_empty(&cmd_buffer->jobs))
return queue_submit_noop_job(queue, sems_info, do_sem_signal, false);
/* When we are in the last cmd buffer and there are semaphores to signal,
* we process semaphores in the last job, following these conditions:
* - CPU-job: we can't signal until all GPU work has completed, so we
* submit a serialized noop GPU job to handle signaling when all on-going
* GPU work on all queues has completed.
* - GPU-job: can signal semaphores only if we have not submitted jobs to
* a queue other than the queue of this job. Otherwise, we submit a
* serialized noop job to handle signaling.
*/
if (do_sem_signal) {
last = list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
if (v3dv_job_type_is_gpu(last))
last->do_sem_signal = true;
}
list_for_each_entry_safe(struct v3dv_job, job,
&cmd_buffer->jobs, list_link) {
if (job->do_sem_signal &&
cmd_buffer_batch_is_multi_queue(queue->device, job->type))
job->do_sem_signal = false;
VkResult result = queue_submit_job(queue, job, sems_info, wait_thread);
if (result != VK_SUCCESS)
return result;
}
/* If we are in the last cmd buffer batch, but the last job cannot handle
* signal semaphores, we emit a serialized noop_job for signalling.
*/
if (do_sem_signal && !(last && last->do_sem_signal))
return queue_submit_noop_job(queue, sems_info, true, true);
return VK_SUCCESS;
}
/* Records a newly spawned wait thread in the wait info of the current queue
 * submission, allocating the wait info on first use.
 */
static void
add_wait_thread_to_list(struct v3dv_device *device,
                        pthread_t thread,
                        struct v3dv_queue_submit_wait_info **wait_info)
{
   struct v3dv_queue_submit_wait_info *info = *wait_info;

   /* The first wait thread spawned for a queue submission creates the
    * v3dv_queue_submit_wait_info that tracks it and any other wait threads
    * of the same submission.
    *
    * NOTE(review): the allocation result is not checked before it is used.
    */
   if (info == NULL) {
      info = vk_zalloc(&device->vk.alloc,
                       sizeof(struct v3dv_queue_submit_wait_info), 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      *wait_info = info;
      info->device = device;
   }

   /* Append the thread in the next free slot */
   const uint32_t slot = info->wait_thread_count;
   assert(slot < 16);
   info->wait_threads[slot].finished = false;
   info->wait_threads[slot].thread = thread;
   info->wait_thread_count = slot + 1;
}
static void
add_signal_semaphores_to_wait_list(struct v3dv_device *device,
const VkSubmitInfo *pSubmit,
struct v3dv_queue_submit_wait_info *wait_info)
{
assert(wait_info);
if (pSubmit->signalSemaphoreCount == 0)
return;
/* Otherwise, we put all the semaphores in a list and we signal all of them
* together from the submit master thread when the last wait thread in the
* submit completes.
*/
/* Check the size of the current semaphore list */
const uint32_t prev_count = wait_info->signal_semaphore_count;
const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore);
VkSemaphore *prev_list = wait_info->signal_semaphores;
/* Resize the list to hold the additional semaphores */
const uint32_t extra_alloc_size =
pSubmit->signalSemaphoreCount * sizeof(VkSemaphore);
wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount;
wait_info->signal_semaphores =
vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
/* Copy the old list to the new allocation and free the old list */
if (prev_count > 0) {
memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size);
vk_free(&device->vk.alloc, prev_list);
}
/* Add the new semaphores to the list */
memcpy(wait_info->signal_semaphores + prev_count,
pSubmit->pSignalSemaphores, extra_alloc_size);
}
static VkResult
queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
const VkSubmitInfo *pSubmit,
struct v3dv_queue_submit_wait_info **wait_info)
{
VkResult result = VK_SUCCESS;
bool has_wait_threads = false;
/* Wrap wait semaphores info from VkSubmitInfo to use it whenever we need
* the data to submit all jobs in the same command buffer batch.
*/
struct v3dv_submit_info_semaphores sems_info = {
.wait_sem_count = pSubmit->waitSemaphoreCount,
.wait_sems = (VkSemaphore *) pSubmit->pWaitSemaphores,
.signal_sem_count = pSubmit->signalSemaphoreCount,
.signal_sems = (VkSemaphore *) pSubmit->pSignalSemaphores,
.fence = 0,
};
/* In the beginning of a cmd buffer batch, we set all last_job_syncs as
* first. It helps to determine wait semaphores conditions.
*/
for (unsigned i = 0; i < V3DV_QUEUE_COUNT; i++)
queue->device->last_job_syncs.first[i] = true;
/* Even if we don't have any actual work to submit we still need to wait
* on the wait semaphores and signal the signal semaphores and fence, so
* in this scenario we just submit a trivial no-op job so we don't have
* to do anything special, it should not be a common case anyway.
*/
if (pSubmit->commandBufferCount == 0) {
result = queue_submit_noop_job(queue, &sems_info,
sems_info.signal_sem_count > 0, false);
} else {
const uint32_t last_cmd_buffer_idx = pSubmit->commandBufferCount - 1;
for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
pthread_t wait_thread;
struct v3dv_cmd_buffer *cmd_buffer =
v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
result = queue_submit_cmd_buffer(queue, cmd_buffer, &sems_info,
(i == last_cmd_buffer_idx),
&wait_thread);
/* We get VK_NOT_READY if we had to spawn a wait thread for the
* command buffer. In that scenario, we want to continue submitting
* any pending command buffers in the batch, but we don't want to
* process any signal semaphores for the batch until we know we have
* submitted every job for every command buffer in the batch.
*/
if (result == VK_NOT_READY) {
result = VK_SUCCESS;
add_wait_thread_to_list(queue->device, wait_thread, wait_info);
has_wait_threads = true;
}
if (result != VK_SUCCESS)
break;
}
}
if (result != VK_SUCCESS)
return result;
/* If had to emit any wait threads in this submit we need to wait for all
* of them to complete before we can signal any semaphores.
*/
if (!has_wait_threads) {
return process_semaphores_to_signal(queue->device,
pSubmit->signalSemaphoreCount,
pSubmit->pSignalSemaphores,
false);
} else {
assert(*wait_info);
add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
return VK_NOT_READY;
}
}
static void *
master_wait_thread_func(void *_wait_info)
{
struct v3dv_queue_submit_wait_info *wait_info =
(struct v3dv_queue_submit_wait_info *) _wait_info;
struct v3dv_queue *queue = &wait_info->device->queue;
/* Wait for all command buffer wait threads to complete */
for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
if (res != 0)
fprintf(stderr, "Wait thread failed to join.\n");
}
/* Signal semaphores and fences */
VkResult result;
result = process_semaphores_to_signal(wait_info->device,
wait_info->signal_semaphore_count,
wait_info->signal_semaphores,
true);
if (result != VK_SUCCESS)
fprintf(stderr, "Wait thread semaphore signaling failed.");
result = process_fence_to_signal(wait_info->device, wait_info->fence);
if (result != VK_SUCCESS)
fprintf(stderr, "Wait thread fence signaling failed.");
/* Release wait_info */
mtx_lock(&queue->mutex);
list_del(&wait_info->list_link);
mtx_unlock(&queue->mutex);
vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores);
vk_free(&wait_info->device->vk.alloc, wait_info);
return NULL;
}
/* Starts the master wait thread for a queue submission and, if that
 * succeeds, adds its wait info to the queue's submit_wait_list.
 *
 * Returns VK_ERROR_DEVICE_LOST if the thread cannot be created.
 */
static VkResult
spawn_master_wait_thread(struct v3dv_queue *queue,
                         struct v3dv_queue_submit_wait_info *wait_info)
{
   VkResult result = VK_SUCCESS;

   /* The queue mutex is held across thread creation and list insertion. The
    * new thread takes the same mutex before unlinking wait_info, so it can't
    * remove the entry before it has been added.
    */
   mtx_lock(&queue->mutex);

   const int create_err = pthread_create(&wait_info->master_wait_thread, NULL,
                                         master_wait_thread_func, wait_info);
   if (create_err == 0)
      list_addtail(&wait_info->list_link, &queue->submit_wait_list);
   else
      result = vk_error(queue, VK_ERROR_DEVICE_LOST);

   mtx_unlock(&queue->mutex);
   return result;
}
/* vkQueueSubmit entry point: submits every batch in pSubmits and then takes
 * care of the fence.
 *
 * If no wait threads were spawned the fence is processed right away.
 * Otherwise a master wait thread is spawned that will signal pending
 * semaphores and the fence once all wait threads have completed.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueSubmit(VkQueue _queue,
                 uint32_t submitCount,
                 const VkSubmitInfo* pSubmits,
                 VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   struct v3dv_queue_submit_wait_info *wait_info = NULL;
   VkResult result = VK_SUCCESS;

   for (uint32_t batch = 0; batch < submitCount; batch++) {
      result =
         queue_submit_cmd_buffer_batch(queue, &pSubmits[batch], &wait_info);

      /* VK_NOT_READY only means the batch spawned wait threads */
      const bool batch_ok = result == VK_SUCCESS || result == VK_NOT_READY;
      if (!batch_ok)
         return result;
   }

   /* No wait threads involved: the fence can be processed immediately */
   if (wait_info == NULL) {
      assert(result != VK_NOT_READY);
      return process_fence_to_signal(queue->device, fence);
   }

   /* We emitted wait threads, so we have to spawn a master thread for this
    * queue submission that waits for all other threads to complete and then
    * signals any semaphores and fences.
    */
   wait_info->fence = fence;
   return spawn_master_wait_thread(queue, wait_info);
}
/* Destroys the DRM syncobj whose handle is stored in *sync, using device_fd,
 * and resets the stored handle to 0.
 */
static void
destroy_syncobj(uint32_t device_fd, uint32_t *sync)
{
   assert(sync);

   const uint32_t handle = *sync;
   *sync = 0;
   drmSyncobjDestroy(device_fd, handle);
}
/* vkCreateSemaphore entry point: allocates a v3dv_semaphore and creates the
 * DRM syncobj that backs it on the render fd.
 *
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if either the object allocation or the
 * syncobj creation fails; in the latter case the object is freed again.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateSemaphore(VkDevice _device,
                     const VkSemaphoreCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkSemaphore *pSemaphore)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   struct v3dv_semaphore *sem =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore),
                       VK_OBJECT_TYPE_SEMAPHORE);
   if (sem == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync);
   if (ret) {
      vk_object_free(&device->vk, pAllocator, sem);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *pSemaphore = v3dv_semaphore_to_handle(sem);

   return VK_SUCCESS;
}
/**
 * Reports the external-handle capabilities of semaphores for the handle type
 * requested in pExternalSemaphoreInfo.
 *
 * Only OPAQUE_FD and SYNC_FD are recognized. Any other handle type is
 * reported as unsupported by zeroing every output field.
 */
VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceExternalSemaphoreProperties(
   VkPhysicalDevice physicalDevice,
   const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
   VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
{
   V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, physicalDevice);

   const bool is_sync_fd =
      pExternalSemaphoreInfo->handleType ==
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
   const bool is_opaque_fd =
      pExternalSemaphoreInfo->handleType ==
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;

   /* Unknown handle type: nothing is supported. */
   if (!is_sync_fd && !is_opaque_fd) {
      pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
      pExternalSemaphoreProperties->compatibleHandleTypes = 0;
      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
      return;
   }

   pExternalSemaphoreProperties->exportFromImportedHandleTypes =
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
   pExternalSemaphoreProperties->compatibleHandleTypes =
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;

   /* Importing requires multisync support in the kernel interface: once an
    * imported semaphore is in the list of semaphores to wait on, the
    * workaround of waiting on the last syncobj fence produced by the device
    * no longer applies, because the imported semaphore will typically not
    * have been produced by the same device.
    */
   pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
   if (pdevice->caps.multisync) {
      pExternalSemaphoreProperties->externalSemaphoreFeatures =
         VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
   }

   /* FIXME: exporting to SYNC_FD is not supported; see the comment in
    * GetPhysicalDeviceExternalFenceProperties for the reason.
    */
   if (!is_sync_fd) {
      pExternalSemaphoreProperties->externalSemaphoreFeatures |=
         VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT;
   }
}
/**
 * Imports a semaphore payload from a file descriptor.
 *
 * SYNC_FD imports are always temporary; OPAQUE_FD imports are temporary only
 * when VK_SEMAPHORE_IMPORT_TEMPORARY_BIT is set. A temporary import is stored
 * in sem->temp_sync, a permanent one replaces sem->sync.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if a syncobj cannot be
 * created, or VK_ERROR_INVALID_EXTERNAL_HANDLE if the fd cannot be imported
 * or the handle type is not supported. On failure the fd is left open.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ImportSemaphoreFdKHR(
   VkDevice _device,
   const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);

   assert(pImportSemaphoreFdInfo->sType ==
          VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR);

   const int drm_fd = device->pdevice->render_fd;
   const int import_fd = pImportSemaphoreFdInfo->fd;

   const bool from_sync_file =
      pImportSemaphoreFdInfo->handleType ==
      VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
   const bool temporary =
      from_sync_file ||
      (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT);

   uint32_t imported;
   if (from_sync_file) {
      /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the
       *  special value -1 for fd is treated like a valid sync file descriptor
       *  referring to an object that has already signaled. The import
       *  operation will succeed and the VkSemaphore will have a temporarily
       *  imported payload as if a valid file descriptor had been provided."
       */
      const unsigned create_flags =
         import_fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
      if (drmSyncobjCreate(drm_fd, create_flags, &imported))
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

      if (import_fd != -1 &&
          drmSyncobjImportSyncFile(drm_fd, imported, import_fd)) {
         drmSyncobjDestroy(drm_fd, imported);
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      }
   } else if (pImportSemaphoreFdInfo->handleType ==
              VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
      if (drmSyncobjFDToHandle(drm_fd, import_fd, &imported))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   } else {
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   /* Any previous temporary payload is dropped regardless of the kind of
    * import being performed.
    */
   if (sem->temp_sync) {
      destroy_syncobj(drm_fd, &sem->temp_sync);
      sem->has_temp = false;
   }

   if (!temporary) {
      destroy_syncobj(drm_fd, &sem->sync);
      sem->sync = imported;
   } else {
      sem->temp_sync = imported;
      sem->has_temp = true;
   }

   /* From the Vulkan 1.0.53 spec:
    *
    *    "Importing a semaphore payload from a file descriptor transfers
    *     ownership of the file descriptor from the application to the
    *     Vulkan implementation. The application must not perform any
    *     operations on the file descriptor after a successful import."
    *
    * We only get here on success, so the fd is now ours to close.
    */
   if (import_fd != -1)
      close(import_fd);

   return VK_SUCCESS;
}
/**
 * Exports the semaphore's syncobj as a file descriptor.
 *
 * SYNC_FD exports a sync file for the syncobj; OPAQUE_FD exports a handle to
 * the syncobj itself. *pFd is set to -1 up front, so it holds -1 whenever
 * the export fails.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the export did not
 * produce a file descriptor.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetSemaphoreFdKHR(VkDevice _device,
                       const VkSemaphoreGetFdInfoKHR *pGetFdInfo,
                       int *pFd)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR);

   *pFd = -1;
   int render_fd = device->pdevice->render_fd;

   /* Each case is a sibling statement of the switch. Previously the
    * OPAQUE_FD label was nested inside the SYNC_FD compound statement.
    */
   switch (pGetFdInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
      drmSyncobjExportSyncFile(render_fd, sem->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
      drmSyncobjHandleToFD(render_fd, sem->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   default:
      unreachable("Unsupported external semaphore handle type");
   }

   return VK_SUCCESS;
}
/**
 * Destroys a semaphore: releases its permanent syncobj, its temporary
 * syncobj if one is set, and then the object itself. A NULL semaphore is
 * ignored.
 */
VKAPI_ATTR void VKAPI_CALL
v3dv_DestroySemaphore(VkDevice _device,
                      VkSemaphore semaphore,
                      const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore);

   if (!sem)
      return;

   const int drm_fd = device->pdevice->render_fd;

   destroy_syncobj(drm_fd, &sem->sync);

   if (sem->temp_sync)
      destroy_syncobj(drm_fd, &sem->temp_sync);

   vk_object_free(&device->vk, pAllocator, sem);
}
/**
 * Creates a fence backed by a DRM syncobj on the physical device's render
 * fd. The syncobj is created already signaled when
 * VK_FENCE_CREATE_SIGNALED_BIT is requested.
 *
 * Returns VK_SUCCESS and writes the new handle to *pFence, or
 * VK_ERROR_OUT_OF_HOST_MEMORY if either the object allocation or the
 * syncobj creation fails (the object is freed in the latter case).
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateFence(VkDevice _device,
                 const VkFenceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkFence *pFence)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   struct v3dv_fence *fence =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence),
                       VK_OBJECT_TYPE_FENCE);
   if (fence == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   unsigned flags = 0;
   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT)
      flags |= DRM_SYNCOBJ_CREATE_SIGNALED;

   int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync);
   if (ret) {
      vk_object_free(&device->vk, pAllocator, fence);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *pFence = v3dv_fence_to_handle(fence);

   return VK_SUCCESS;
}
/**
 * Reports the external-handle capabilities of fences for the handle type
 * requested in pExternalFenceInfo.
 *
 * Only OPAQUE_FD and SYNC_FD are recognized. Any other handle type is
 * reported as unsupported by zeroing every output field.
 */
VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceExternalFenceProperties(
   VkPhysicalDevice physicalDevice,
   const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo,
   VkExternalFenceProperties *pExternalFenceProperties)
{
   const bool is_sync_fd =
      pExternalFenceInfo->handleType ==
      VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
   const bool is_opaque_fd =
      pExternalFenceInfo->handleType ==
      VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT;

   /* Unknown handle type: nothing is supported. */
   if (!is_sync_fd && !is_opaque_fd) {
      pExternalFenceProperties->exportFromImportedHandleTypes = 0;
      pExternalFenceProperties->compatibleHandleTypes = 0;
      pExternalFenceProperties->externalFenceFeatures = 0;
      return;
   }

   pExternalFenceProperties->exportFromImportedHandleTypes =
      VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
      VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
   pExternalFenceProperties->compatibleHandleTypes =
      VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
      VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
   pExternalFenceProperties->externalFenceFeatures =
      VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;

   /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not
    * the syncobj itself. That fence only exists after we have submitted to
    * the kernel and updated the syncobj to import the DRM fence created by
    * the submission. If the queue submission has a 'wait for events', we may
    * hold the jobs after the wait in a user-space thread until the events
    * are signaled, and the out fence of the submit is not updated until then.
    * So if an application submits with an out fence and a wait for events,
    * exporting the out fence to a SYNC_FD right after the submission and
    * before the events are signaled will fail, because the DRM fence does
    * not exist yet. OPAQUE_FD is not affected because it exports the whole
    * syncobj rather than the underlying DRM fence. Fixing this requires a
    * more flexible kernel interface that accepts multiple in/out syncobjs,
    * so event waits can be implemented as regular fence waits on the kernel
    * side. Until then, only OPAQUE_FD can be exported reliably.
    */
   if (!is_sync_fd) {
      pExternalFenceProperties->externalFenceFeatures |=
         VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
   }
}
/**
 * Imports a fence payload from a file descriptor.
 *
 * SYNC_FD imports are always temporary; OPAQUE_FD imports are temporary only
 * when VK_FENCE_IMPORT_TEMPORARY_BIT is set. A temporary import is stored in
 * fence->temp_sync, a permanent one replaces fence->sync.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if a syncobj cannot be
 * created, or VK_ERROR_INVALID_EXTERNAL_HANDLE if the fd cannot be imported
 * or the handle type is not supported. On failure the fd is left open.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ImportFenceFdKHR(VkDevice _device,
                      const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);

   assert(pImportFenceFdInfo->sType ==
          VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);

   const int drm_fd = device->pdevice->render_fd;
   const int import_fd = pImportFenceFdInfo->fd;

   const bool from_sync_file =
      pImportFenceFdInfo->handleType ==
      VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
   const bool temporary =
      from_sync_file ||
      (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);

   uint32_t imported;
   if (from_sync_file) {
      /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
       *  special value -1 for fd is treated like a valid sync file descriptor
       *  referring to an object that has already signaled. The import
       *  operation will succeed and the VkFence will have a temporarily
       *  imported payload as if a valid file descriptor had been provided."
       */
      const unsigned create_flags =
         import_fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
      if (drmSyncobjCreate(drm_fd, create_flags, &imported))
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

      if (import_fd != -1 &&
          drmSyncobjImportSyncFile(drm_fd, imported, import_fd)) {
         drmSyncobjDestroy(drm_fd, imported);
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      }
   } else if (pImportFenceFdInfo->handleType ==
              VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT) {
      if (drmSyncobjFDToHandle(drm_fd, import_fd, &imported))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   } else {
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   /* Any previous temporary payload is dropped regardless of the kind of
    * import being performed.
    */
   if (fence->temp_sync) {
      destroy_syncobj(drm_fd, &fence->temp_sync);
      fence->has_temp = false;
   }

   if (!temporary) {
      destroy_syncobj(drm_fd, &fence->sync);
      fence->sync = imported;
   } else {
      fence->temp_sync = imported;
      fence->has_temp = true;
   }

   /* From the Vulkan 1.0.53 spec:
    *
    *    "Importing a fence payload from a file descriptor transfers
    *     ownership of the file descriptor from the application to the
    *     Vulkan implementation. The application must not perform any
    *     operations on the file descriptor after a successful import."
    *
    * We only get here on success, so the fd is now ours to close.
    */
   if (import_fd != -1)
      close(import_fd);

   return VK_SUCCESS;
}
/**
 * Destroys a fence: releases its permanent syncobj, its temporary syncobj if
 * one is set, and then the object itself. A NULL fence is ignored.
 */
VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyFence(VkDevice _device,
                  VkFence _fence,
                  const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);

   if (!fence)
      return;

   const int drm_fd = device->pdevice->render_fd;

   destroy_syncobj(drm_fd, &fence->sync);

   if (fence->temp_sync)
      destroy_syncobj(drm_fd, &fence->temp_sync);

   vk_object_free(&device->vk, pAllocator, fence);
}
/**
 * Queries the state of a fence without blocking, by waiting on its syncobj
 * with a zero timeout.
 *
 * Returns VK_SUCCESS if the wait succeeds, VK_NOT_READY if it times out
 * (-ETIME), and VK_ERROR_DEVICE_LOST on any other failure.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetFenceStatus(VkDevice _device, VkFence _fence)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);

   /* fence_get_sync() selects which syncobj represents the fence's current
    * payload (presumably the temporary one when present -- see its
    * definition).
    */
   uint32_t handle = fence_get_sync(fence);

   const int wait_ret =
      drmSyncobjWait(device->pdevice->render_fd, &handle, 1, 0,
                     DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL);

   if (wait_ret == 0)
      return VK_SUCCESS;

   if (wait_ret == -ETIME)
      return VK_NOT_READY;

   return vk_error(device, VK_ERROR_DEVICE_LOST);
}
/**
 * Exports the fence's syncobj as a file descriptor.
 *
 * SYNC_FD exports a sync file for the syncobj; OPAQUE_FD exports a handle to
 * the syncobj itself. *pFd is set to -1 up front, so it holds -1 whenever
 * the export fails.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the export did not
 * produce a file descriptor.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetFenceFdKHR(VkDevice _device,
                   const VkFenceGetFdInfoKHR *pGetFdInfo,
                   int *pFd)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR);

   *pFd = -1;
   int render_fd = device->pdevice->render_fd;

   /* NOTE(review): this always exports fence->sync, while the wait/status
    * paths go through fence_get_sync(). Confirm that ignoring a temporary
    * payload here is intended.
    *
    * Each case is a sibling statement of the switch. Previously the
    * OPAQUE_FD label was nested inside the SYNC_FD compound statement.
    */
   switch (pGetFdInfo->handleType) {
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
      drmSyncobjExportSyncFile(render_fd, fence->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
      drmSyncobjHandleToFD(render_fd, fence->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   default:
      unreachable("Unsupported external fence handle type");
   }

   return VK_SUCCESS;
}
/**
 * Resets a set of fences.
 *
 * A fence holding a temporary payload is restored to its permanent payload
 * by destroying the temporary syncobj; its permanent syncobj is not reset.
 * All other fences have their syncobj reset in a single drmSyncobjReset()
 * call.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the scratch array
 * cannot be allocated or the reset fails.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   uint32_t *to_reset = vk_alloc(&device->vk.alloc,
                                 sizeof(*to_reset) * fenceCount, 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (to_reset == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   const int drm_fd = device->pdevice->render_fd;
   uint32_t num_to_reset = 0;

   for (uint32_t idx = 0; idx < fenceCount; idx++) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[idx]);

      if (!fence->has_temp) {
         to_reset[num_to_reset++] = fence->sync;
         continue;
      }

      /* From the Vulkan spec, section 'Importing Fence Payloads':
       *
       *    "If the import is temporary, the fence will be restored to its
       *     permanent state the next time that fence is passed to
       *     vkResetFences.
       *
       *     Note: Restoring a fence to its prior permanent payload is a
       *     distinct operation from resetting a fence payload."
       *
       * Destroying the temporary is all that is needed to restore the
       * previous state.
       */
      assert(fence->temp_sync);
      destroy_syncobj(drm_fd, &fence->temp_sync);
      fence->has_temp = false;
   }

   int reset_ret = 0;
   if (num_to_reset > 0)
      reset_ret = drmSyncobjReset(drm_fd, to_reset, num_to_reset);

   vk_free(&device->vk.alloc, to_reset);

   if (reset_ret != 0)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   return VK_SUCCESS;
}
/**
 * Waits for one or all of the given fences to signal.
 *
 * @param fenceCount  number of entries in pFences
 * @param pFences     fences to wait on
 * @param waitAll     when true, wait for every fence; otherwise for any one
 * @param timeout     relative timeout as provided by the application
 *
 * Returns VK_SUCCESS if the wait completes, VK_TIMEOUT if the deadline
 * expires first, VK_ERROR_OUT_OF_HOST_MEMORY if the scratch array cannot be
 * allocated, and VK_ERROR_DEVICE_LOST on any other wait failure.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_WaitForFences(VkDevice _device,
                   uint32_t fenceCount,
                   const VkFence *pFences,
                   VkBool32 waitAll,
                   uint64_t timeout)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   /* NOTE(review): assumes get_absolute_timeout() yields a deadline on the
    * same clock gettime_ns() reads and that this is the clock the kernel
    * uses for syncobj waits (CLOCK_MONOTONIC) -- confirm against the
    * helpers' definitions.
    */
   const uint64_t abs_timeout = get_absolute_timeout(timeout);

   /* drmSyncobjWait() takes a signed absolute deadline, so clamp rather
    * than let a huge value wrap to a negative one.
    */
   const int64_t wait_deadline =
      abs_timeout > (uint64_t)INT64_MAX ? INT64_MAX : (int64_t)abs_timeout;

   uint32_t *syncobjs = vk_alloc(&device->vk.alloc,
                                 sizeof(*syncobjs) * fenceCount, 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!syncobjs)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   for (uint32_t i = 0; i < fenceCount; i++) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]);
      syncobjs[i] = fence_get_sync(fence);
   }

   unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (waitAll)
      flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   /* The syncobj wait ioctl expects an absolute deadline. Passing the
    * relative timeout made the kernel see a deadline in the past, return
    * -ETIME right away and turn this loop into a busy-wait. With the
    * absolute deadline the kernel blocks; the loop only covers early
    * -ETIME returns.
    */
   int ret;
   do {
      ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount,
                           wait_deadline, flags, NULL);
   } while (ret == -ETIME && gettime_ns() < abs_timeout);

   vk_free(&device->vk.alloc, syncobjs);

   if (ret == -ETIME)
      return VK_TIMEOUT;
   else if (ret)
      return vk_error(device, VK_ERROR_DEVICE_LOST);

   return VK_SUCCESS;
}
/**
 * Stub for sparse binding: the bind information and fence are never read,
 * and the call always fails with VK_ERROR_FEATURE_NOT_PRESENT.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   const VkResult result = vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);

   return result;
}