radv: Always allow sparse on normal GFX/COMPUTE/DMA queues.

Forcing a dedicated sparse queue is problematic in real-world scenarios.

In the current implicit sync world for sparse updates, we can rely on
submission order.

For use cases where an application can take advantage of the separate
sparse queue to do "async" updates, the existing implementation works
well, but problems arise when trying to implement D3D-style submission
ordering. E.g., when a game does sparse on a graphics or compute queue,
we need to guarantee that previous submissions, the sparse update, and
future submissions are properly ordered.
The Vulkan way of implementing this (sketched in code after this list) is to:

- Signal timeline N on the graphics queue (i.e. with its last submission)
- Wait on timeline N on the sparse queue
- Do the sparse updates
- Signal timeline N + 1 on the sparse queue
- Wait for timeline N + 1 on the graphics queue (can be deferred until
  the next graphics submit)
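
In app-side Vulkan terms, that recipe looks roughly like the sketch
below. This is illustrative rather than RADV code: the queue handle,
timeline semaphore and opaque bind are hypothetical placeholders, and
error checking is omitted.

#include <vulkan/vulkan.h>

/* Hypothetical app-side sketch of the list above.  Assumes a
 * VK_SEMAPHORE_TYPE_TIMELINE semaphore whose value n was signalled by
 * the last graphics submission. */
static void
bind_sparse_via_dedicated_queue(VkQueue sparse_queue, VkSemaphore timeline, uint64_t n,
                                const VkSparseImageOpaqueMemoryBindInfo *opaque_bind)
{
   uint64_t wait_value = n;       /* previous graphics work */
   uint64_t signal_value = n + 1; /* unblocks future graphics work */

   VkTimelineSemaphoreSubmitInfo timeline_info = {
      .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
      .waitSemaphoreValueCount = 1,
      .pWaitSemaphoreValues = &wait_value,
      .signalSemaphoreValueCount = 1,
      .pSignalSemaphoreValues = &signal_value,
   };

   VkBindSparseInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO,
      .pNext = &timeline_info,
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &timeline,
      .imageOpaqueBindCount = 1,
      .pImageOpaqueBinds = opaque_bind,
      .signalSemaphoreCount = 1,
      .pSignalSemaphores = &timeline,
   };
   vkQueueBindSparse(sparse_queue, 1, &bind_info, VK_NULL_HANDLE);

   /* The next graphics submit must then wait on timeline value n + 1
    * before any of its command buffers can execute. */
}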

This causes an unavoidable bubble in GPU execution, since the
existing sparse queue ends up doing:

- Wait for the pending signal. The implication here is that all previous
  GPU work must already have been submitted.
- Do VM operations on the CPU timeline.
- Wait for semaphores to signal (this is required for signal ordering).
- ... the GPU meanwhile stalls in a bubble due to the GPU -> CPU -> GPU
  round trip.
- Signal the semaphore on the CPU (unblocks GPU work).

Letting the GPU go idle here is not great, and we can be screwed over by bad thread scheduling.

Another knock-on effect is that the graphics queue is now forced into
using a thread for submissions. This is because when the graphics queue
wants to wait for timeline N + 1, the sparse queue may not have
signalled the timeline on the CPU yet, so effectively we have created a
wait-before-signal situation internally in RADV. Throwing another thread
under the bus is not great either.

Just letting the queue in question support sparse binding solves all
these issues, and I don't see a path forward where the D3D use case can
be solved in a separate-queue world.
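
As a rough sketch of what this enables (hypothetical handles, error
checking omitted, not RADV code): the bind is submitted to the graphics
queue itself, and with the implicit-sync submission ordering described
above, it lands after pre_cmd and before post_cmd without any semaphore
round trip.

#include <vulkan/vulkan.h>

/* D3D-style ordering on a single queue, relying on submission order
 * for the sparse bind. */
static void
bind_sparse_on_graphics_queue(VkQueue gfx_queue, VkCommandBuffer pre_cmd, VkCommandBuffer post_cmd,
                              const VkSparseImageOpaqueMemoryBindInfo *opaque_bind)
{
   const VkSubmitInfo pre = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &pre_cmd,
   };
   vkQueueSubmit(gfx_queue, 1, &pre, VK_NULL_HANDLE);

   /* Ordered after pre and before post by submission order alone. */
   const VkBindSparseInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO,
      .imageOpaqueBindCount = 1,
      .pImageOpaqueBinds = opaque_bind,
   };
   vkQueueBindSparse(gfx_queue, 1, &bind_info, VK_NULL_HANDLE);

   const VkSubmitInfo post = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &post_cmd,
   };
   vkQueueSubmit(gfx_queue, 1, &post, VK_NULL_HANDLE);
}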

It is also friendlier to the ecosystem at large. RADV is the only driver
I know of that insists on separate sparse queues, and multiple games
assume that the graphics queue supports sparse binding.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33166>

@@ -355,7 +355,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
       marker.queue_flags |= VK_QUEUE_GRAPHICS_BIT;
-   if (!radv_sparse_queue_enabled(pdev))
+   if (!radv_dedicated_sparse_queue_enabled(pdev))
       marker.queue_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
 
    radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);

@@ -318,7 +318,7 @@ radv_physical_device_init_queue_table(struct radv_physical_device *pdev)
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       pdev->vk_queue_to_radv[idx] = RADV_QUEUE_SPARSE;
       idx++;
    }
@@ -2401,7 +2401,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
       num_queue_families++;
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       num_queue_families++;
    }
@@ -2415,9 +2415,8 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    idx = 0;
    if (*pCount >= 1) {
-      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         gfx_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
+                               VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
 
       *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
          .queueFlags = gfx_flags,
          .queueCount = 1,
@@ -2428,9 +2427,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    }
 
    if (pdev->info.ip[AMD_IP_COMPUTE].num_queues > 0 && !(instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
-      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         compute_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = compute_flags,
@@ -2459,7 +2456,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    if (radv_transfer_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
-            .queueFlags = VK_QUEUE_TRANSFER_BIT,
+            .queueFlags = VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT,
             .queueCount = pdev->info.ip[AMD_IP_SDMA].num_queues,
             .timestampValidBits = 64,
             .minImageTransferGranularity = (VkExtent3D){16, 16, 8},
@@ -2482,7 +2479,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = VK_QUEUE_SPARSE_BINDING_BIT,

@@ -195,7 +195,7 @@ radv_physical_device_instance(const struct radv_physical_device *pdev)
 }
 
 static inline bool
-radv_sparse_queue_enabled(const struct radv_physical_device *pdev)
+radv_dedicated_sparse_queue_enabled(const struct radv_physical_device *pdev)
 {
    const struct radv_instance *instance = radv_physical_device_instance(pdev);

@@ -1896,16 +1896,10 @@ radv_queue_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
 {
    struct radv_queue *queue = (struct radv_queue *)vqueue;
    struct radv_device *device = radv_queue_device(queue);
-   const struct radv_physical_device *pdev = radv_device_physical(device);
-   VkResult result;
 
-   if (!radv_sparse_queue_enabled(pdev)) {
-      result = radv_queue_submit_bind_sparse_memory(device, submission);
-      if (result != VK_SUCCESS)
-         goto fail;
-   } else {
-      assert(!submission->buffer_bind_count && !submission->image_bind_count && !submission->image_opaque_bind_count);
-   }
+   VkResult result = radv_queue_submit_bind_sparse_memory(device, submission);
+   if (result != VK_SUCCESS)
+      goto fail;
 
    if (!submission->command_buffer_count && !submission->wait_count && !submission->signal_count)
       return VK_SUCCESS;