From 60fdab22a5f38f7ecef040961d1401d27d4d83b1 Mon Sep 17 00:00:00 2001 From: Roman Stratiienko Date: Wed, 29 Apr 2026 19:02:11 +0300 Subject: [PATCH] v3dv: Emulate multi-queue support via vk_queue for Android Android14+ relies on at least 2 queues for vulkan skia/UI rendering. More explained [here][1] [1]: https://gitlab.freedesktop.org/mesa/mesa/-/work_items/11326 Signed-off-by: Roman Stratiienko Reviewed-by: Iago Toral Quiroga Part-of: --- src/broadcom/vulkan/v3dv_device.c | 52 ++++++++++++++++------ src/broadcom/vulkan/v3dv_device.h | 18 +++++--- src/broadcom/vulkan/v3dv_limits.h | 8 ++++ src/broadcom/vulkan/v3dv_queue.c | 71 +++++++++++++++++-------------- 4 files changed, 100 insertions(+), 49 deletions(-) diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index ccce410fa4f..cf7fa43675e 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -1684,13 +1684,13 @@ v3dv_physical_device_device_id(const struct v3dv_physical_device *dev) } } -/* We support exactly one queue family. 
*/ +/* We support multiqueue emulation */ static const VkQueueFamilyProperties v3dv_queue_family_properties = { .queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, - .queueCount = 1, + .queueCount = V3DV_MAX_QUEUES, .timestampValidBits = 64, .minImageTransferGranularity = { 1, 1, 1 }, }; @@ -1909,14 +1909,16 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); - /* Check requested queues (we only expose one queue ) */ - assert(pCreateInfo->queueCreateInfoCount == 1); + /* Check requested queues */ + uint32_t total_queues = 0; for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { assert(pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex == 0); - assert(pCreateInfo->pQueueCreateInfos[i].queueCount == 1); + assert(pCreateInfo->pQueueCreateInfos[i].queueCount <= V3DV_MAX_QUEUES); if (pCreateInfo->pQueueCreateInfos[i].flags != 0) return vk_error(instance, VK_ERROR_INITIALIZATION_FAILED); + total_queues += pCreateInfo->pQueueCreateInfos[i].queueCount; } + assert(total_queues <= V3DV_MAX_QUEUES); device = vk_zalloc2(&physical_device->vk.instance->alloc, pAllocator, sizeof(*device), 8, @@ -1939,6 +1941,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, device->instance = instance; device->pdevice = physical_device; + mtx_init(&device->queue_mutex, mtx_plain); mtx_init(&device->query_mutex, mtx_plain); cnd_init(&device->query_ended); @@ -1948,10 +1951,25 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, vk_device_set_drm_fd(&device->vk, physical_device->render_fd); vk_device_enable_threaded_submit(&device->vk); - result = queue_init(device, &device->queue, - pCreateInfo->pQueueCreateInfos, 0); - if (result != VK_SUCCESS) - goto fail; + device->queues = vk_zalloc2(&device->vk.alloc, pAllocator, + sizeof(*device->queues) * total_queues, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->queues) { + result = vk_error(instance, 
VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_queues_alloc; + } + + device->queue_count = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + for (uint32_t j = 0; j < pCreateInfo->pQueueCreateInfos[i].queueCount; j++) { + result = queue_init(device, &device->queues[device->queue_count], + &pCreateInfo->pQueueCreateInfos[i], j); + if (result != VK_SUCCESS) + goto fail; + + device->queue_count++; + } + } device->devinfo = physical_device->devinfo; @@ -2000,9 +2018,13 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, return VK_SUCCESS; fail: + for (uint32_t i = 0; i < device->queue_count; i++) + queue_finish(&device->queues[i]); + vk_free2(&device->vk.alloc, pAllocator, device->queues); +fail_queues_alloc: cnd_destroy(&device->query_ended); mtx_destroy(&device->query_mutex); - queue_finish(&device->queue); + mtx_destroy(&device->queue_mutex); if (device->noop_job) v3dv_job_destroy(device->noop_job); destroy_device_meta(device); @@ -2022,7 +2044,9 @@ v3dv_DestroyDevice(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); device->vk.dispatch_table.DeviceWaitIdle(_device); - queue_finish(&device->queue); + for (uint32_t i = 0; i < device->queue_count; i++) + queue_finish(&device->queues[i]); + vk_free2(&device->vk.alloc, pAllocator, device->queues); if (device->noop_job) v3dv_job_destroy(device->noop_job); @@ -2049,6 +2073,7 @@ v3dv_DestroyDevice(VkDevice _device, cnd_destroy(&device->query_ended); mtx_destroy(&device->query_mutex); + mtx_destroy(&device->queue_mutex); vk_device_finish(&device->vk); vk_free2(&device->vk.alloc, pAllocator, device); @@ -2258,8 +2283,11 @@ free_memory(struct v3dv_device *device, if (mem->bo->map) device_unmap(device, mem); - if (mem->is_for_device_address) + if (mem->is_for_device_address) { + mtx_lock(&device->queue_mutex); device_remove_device_address_bo(device, mem->bo); + mtx_unlock(&device->queue_mutex); + } device_free(device, mem); diff --git a/src/broadcom/vulkan/v3dv_device.h 
b/src/broadcom/vulkan/v3dv_device.h index 835030bfacd..68635f868fc 100644 --- a/src/broadcom/vulkan/v3dv_device.h +++ b/src/broadcom/vulkan/v3dv_device.h @@ -204,11 +204,6 @@ struct v3dv_queue { struct v3dv_device *device; struct v3dv_last_job_sync last_job_syncs; - - /* The last active perfmon ID to prevent mixing of counter results when a - * job is submitted with a different perfmon id. - */ - uint32_t last_perfmon_id; }; VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue, @@ -253,10 +248,21 @@ struct v3dv_device { struct v3dv_physical_device *pdevice; struct v3d_device_info devinfo; - struct v3dv_queue queue; + struct v3dv_queue *queues; + uint32_t queue_count; + + /* In cases where we instantiate more than one queue (Android), this protects + * against concurrent access from multiple queues. + */ + mtx_t queue_mutex; struct v3dv_job *noop_job; + /* The last active perfmon ID to prevent mixing of counter results when a + * job is submitted with a different perfmon id. + */ + uint32_t last_perfmon_id; + /* Guards query->maybe_available and value for timestamps */ mtx_t query_mutex; diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h index 482cb8137e0..2d16e2d087f 100644 --- a/src/broadcom/vulkan/v3dv_limits.h +++ b/src/broadcom/vulkan/v3dv_limits.h @@ -24,6 +24,14 @@ #define V3DV_LIMITS_H #include "drm-uapi/v3d_drm.h" +#include "util/detect_os.h" + +#if DETECT_OS_ANDROID +#define V3DV_MAX_QUEUES 4 +#else +#define V3DV_MAX_QUEUES 1 +#endif + /* From vulkan spec "If the multiple viewports feature is not enabled, * scissorCount must be 1", ditto for viewportCount. 
For now we don't support diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index 3ef2a853f05..9f14c7629c4 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -258,14 +258,14 @@ set_multisync(struct drm_v3d_multi_sync *ms, struct vk_sync_wait *waits, unsigned wait_count, struct drm_v3d_extension *next, - struct v3dv_device *device, + struct v3dv_queue *queue, struct v3dv_job *job, enum v3dv_queue_type in_queue_sync, enum v3dv_queue_type out_queue_sync, enum v3d_queue wait_stage, bool signal_syncs) { - struct v3dv_queue *queue = &device->queue; + struct v3dv_device *device = queue->device; uint32_t out_sync_count = 0, in_sync_count = 0; struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL; @@ -339,7 +339,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, reset.syncs = (uintptr_t)(void *)syncs; - set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job, + set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, queue, job, V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); if (!ms.base.id) { free(syncs); @@ -381,7 +381,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, reset.syncs = (uintptr_t)(void *)syncs; reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids; - set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job, + set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, queue, job, V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); if (!ms.base.id) { free(syncs); @@ -481,7 +481,7 @@ export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int if (err) { close(*fd); - return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN, + return vk_errorf(queue, VK_ERROR_UNKNOWN, "sync file export failed: %m"); } @@ -490,7 +490,7 @@ export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int if (err) { close(tmp_fd); close(*fd); - return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN, + return 
vk_errorf(queue, VK_ERROR_UNKNOWN, "failed to accumulate sync files: %m"); } } @@ -499,7 +499,7 @@ export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int } static VkResult -handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx) +handle_end_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, uint32_t counter_pass_idx) { MESA_TRACE_FUNC(); VkResult result = VK_SUCCESS; @@ -507,7 +507,6 @@ handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx) mtx_lock(&job->device->query_mutex); struct v3dv_end_query_info *info = &job->cpu.query_end; - struct v3dv_queue *queue = &job->device->queue; int err = 0; int fd = -1; @@ -611,7 +610,7 @@ handle_copy_query_results_cpu_job(struct v3dv_queue *queue, copy.offsets = (uintptr_t)(void *)offsets; copy.syncs = (uintptr_t)(void *)syncs; - set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job, + set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, queue, job, V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); if (!ms.base.id) { free(bo_handles); @@ -668,7 +667,7 @@ handle_copy_query_results_cpu_job(struct v3dv_queue *queue, copy.syncs = (uintptr_t)(void *)syncs; copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids; - set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job, + set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, queue, job, V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); if (!ms.base.id) { free(kperfmon_ids); @@ -796,7 +795,7 @@ handle_timestamp_query_cpu_job(struct v3dv_queue *queue, */ job->serialize = V3DV_BARRIER_ALL; - set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job, + set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, queue, job, V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs); if (!ms.base.id) { free(offsets); @@ -901,7 +900,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, * CSD job, as the CPU job must obey to the CSD job synchronization * demands, such as barriers.
*/ - set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job, + set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, queue, csd_job, V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs); if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -924,6 +923,22 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, return VK_SUCCESS; } +static inline void +job_add_device_address_bos(struct v3dv_job *job, struct v3dv_queue *queue) +{ + if (!job->uses_buffer_device_address) + return; + + struct v3dv_device *device = queue->device; + + mtx_lock(&device->queue_mutex); + util_dynarray_foreach(&device->device_address_bo_list, + struct v3dv_bo *, bo) { + v3dv_job_add_bo(job, *bo); + } + mtx_unlock(&device->queue_mutex); +} + static VkResult handle_cl_job(struct v3dv_queue *queue, struct v3dv_job *job, @@ -966,12 +981,7 @@ handle_cl_job(struct v3dv_queue *queue, * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT * are included. */ - if (job->uses_buffer_device_address) { - util_dynarray_foreach(&queue->device->device_address_bo_list, - struct v3dv_bo *, bo) { - v3dv_job_add_bo(job, *bo); - } - } + job_add_device_address_bos(job, queue); submit.bo_handle_count = job->bo_count; uint32_t *bo_handles = @@ -986,8 +996,10 @@ handle_cl_job(struct v3dv_queue *queue, submit.perfmon_id = job->perf ? 
job->perf->kperfmon_ids[counter_pass_idx] : 0; - const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id; - queue->last_perfmon_id = submit.perfmon_id; + mtx_lock(&device->queue_mutex); + const bool needs_perf_sync = device->last_perfmon_id != submit.perfmon_id; + device->last_perfmon_id = submit.perfmon_id; + mtx_unlock(&device->queue_mutex); /* We need a binning sync if we are the first CL job waiting on a semaphore * with a wait stage that involves the geometry pipeline, or if the job @@ -1026,7 +1038,7 @@ handle_cl_job(struct v3dv_queue *queue, */ struct drm_v3d_multi_sync ms = { 0 }; enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN; - set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + set_multisync(&ms, sync_info, NULL, 0, NULL, queue, job, V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs); if (!ms.base.id) { free(bo_handles); @@ -1078,7 +1090,7 @@ handle_tfu_job(struct v3dv_queue *queue, * multiple semaphore extension. */ struct drm_v3d_multi_sync ms = { 0 }; - set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + set_multisync(&ms, sync_info, NULL, 0, NULL, queue, job, V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs); if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -1118,12 +1130,7 @@ handle_csd_job(struct v3dv_queue *queue, * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT * are included. */ - if (job->uses_buffer_device_address) { - util_dynarray_foreach(&queue->device->device_address_bo_list, - struct v3dv_bo *, bo) { - v3dv_job_add_bo(job, *bo); - } - } + job_add_device_address_bos(job, queue); submit->bo_handle_count = job->bo_count; uint32_t *bo_handles = @@ -1140,7 +1147,7 @@ handle_csd_job(struct v3dv_queue *queue, * multiple semaphore extension. 
*/ struct drm_v3d_multi_sync ms = { 0 }; - set_multisync(&ms, sync_info, NULL, 0, NULL, device, job, + set_multisync(&ms, sync_info, NULL, 0, NULL, queue, job, V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs); if (!ms.base.id) return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -1154,7 +1161,9 @@ handle_csd_job(struct v3dv_queue *queue, submit->perfmon_id = job->perf ? job->perf->kperfmon_ids[counter_pass_idx] : 0; - queue->last_perfmon_id = submit->perfmon_id; + mtx_lock(&device->queue_mutex); + device->last_perfmon_id = submit->perfmon_id; + mtx_unlock(&device->queue_mutex); int ret = v3d_ioctl(device->pdevice->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit); @@ -1220,7 +1229,7 @@ queue_handle_job(struct v3dv_queue *queue, case V3DV_JOB_TYPE_CPU_RESET_QUERIES: return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_END_QUERY: - return handle_end_query_cpu_job(job, counter_pass_idx); + return handle_end_query_cpu_job(queue, job, counter_pass_idx); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: