radv: Always allow sparse on normal GFX/COMPUTE/DMA queues.

Forcing a dedicated sparse queue is problematic in real-world scenarios.

In the current implicit sync world for sparse updates, we can rely on
submission order.

For use cases where an application can take advantage of the separate
sparse queue to do "async" updates, the existing implementation works
well, but problems arise when trying to implement D3D-style submission
ordering. E.g., when a game does sparse on a graphics or compute queue,
we need to guarantee that previous submissions, the sparse update, and
future submissions are properly ordered.
The Vulkan way of implementing this (sketched in code after this list) is to:

- Signal timeline N on the graphics queue (i.e. with its last submission)
- Wait on timeline N on the sparse queue
- Do the sparse updates
- Signal timeline N + 1 on the sparse queue
- Wait for timeline N + 1 on the graphics queue (can be deferred until
  the next graphics submit)
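
In app-side Vulkan terms, that recipe looks roughly like the sketch
below. This is illustrative rather than RADV code: the queue handle,
timeline semaphore and opaque bind are hypothetical placeholders, and
error checking is omitted.

#include <vulkan/vulkan.h>

/* Hypothetical app-side sketch of the list above.  Assumes a
 * VK_SEMAPHORE_TYPE_TIMELINE semaphore whose value n was signalled by
 * the last graphics submission. */
static void
bind_sparse_via_dedicated_queue(VkQueue sparse_queue, VkSemaphore timeline, uint64_t n,
                                const VkSparseImageOpaqueMemoryBindInfo *opaque_bind)
{
   uint64_t wait_value = n;       /* previous graphics work */
   uint64_t signal_value = n + 1; /* unblocks future graphics work */

   VkTimelineSemaphoreSubmitInfo timeline_info = {
      .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
      .waitSemaphoreValueCount = 1,
      .pWaitSemaphoreValues = &wait_value,
      .signalSemaphoreValueCount = 1,
      .pSignalSemaphoreValues = &signal_value,
   };

   VkBindSparseInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO,
      .pNext = &timeline_info,
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &timeline,
      .imageOpaqueBindCount = 1,
      .pImageOpaqueBinds = opaque_bind,
      .signalSemaphoreCount = 1,
      .pSignalSemaphores = &timeline,
   };
   vkQueueBindSparse(sparse_queue, 1, &bind_info, VK_NULL_HANDLE);

   /* The next graphics submit must then wait on timeline value n + 1
    * before any of its command buffers can execute. */
}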

This causes an unavoidable bubble in GPU execution, since the
existing sparse queue ends up doing:

- Wait for the pending signal. The implication here is that all previous
  GPU work must already have been submitted.
- Do VM operations on the CPU timeline.
- Wait for semaphores to signal (this is required for signal ordering).
- ... the GPU meanwhile stalls in a bubble due to the GPU -> CPU -> GPU
  round trip.
- Signal the semaphore on the CPU (unblocks GPU work).

Letting the GPU go idle here is not great, and we can be screwed over by bad thread scheduling.

Another knock-on effect is that the graphics queue is now forced into
using a thread for submissions. This is because when the graphics queue
wants to wait for timeline N + 1, the sparse queue may not have
signalled the timeline on the CPU yet, so effectively we have created a
wait-before-signal situation internally in RADV. Throwing another thread
under the bus is not great either.

Just letting the queue in question support sparse binding solves all
these issues, and I don't see a path forward where the D3D use case can
be solved in a separate-queue world.
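
As a rough sketch of what this enables (hypothetical handles, error
checking omitted, not RADV code): the bind is submitted to the graphics
queue itself, and with the implicit-sync submission ordering described
above, it lands after pre_cmd and before post_cmd without any semaphore
round trip.

#include <vulkan/vulkan.h>

/* D3D-style ordering on a single queue, relying on submission order
 * for the sparse bind. */
static void
bind_sparse_on_graphics_queue(VkQueue gfx_queue, VkCommandBuffer pre_cmd, VkCommandBuffer post_cmd,
                              const VkSparseImageOpaqueMemoryBindInfo *opaque_bind)
{
   const VkSubmitInfo pre = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &pre_cmd,
   };
   vkQueueSubmit(gfx_queue, 1, &pre, VK_NULL_HANDLE);

   /* Ordered after pre and before post by submission order alone. */
   const VkBindSparseInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO,
      .imageOpaqueBindCount = 1,
      .pImageOpaqueBinds = opaque_bind,
   };
   vkQueueBindSparse(gfx_queue, 1, &bind_info, VK_NULL_HANDLE);

   const VkSubmitInfo post = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &post_cmd,
   };
   vkQueueSubmit(gfx_queue, 1, &post, VK_NULL_HANDLE);
}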

It is also friendlier to the ecosystem at large. RADV is the only driver
I know of that insists on separate sparse queues, and multiple games
assume that the graphics queue supports sparse binding.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33166>

@@ -355,7 +355,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
       marker.queue_flags |= VK_QUEUE_GRAPHICS_BIT;
-   if (!radv_sparse_queue_enabled(pdev))
+   if (!radv_dedicated_sparse_queue_enabled(pdev))
       marker.queue_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
 
    radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);

@@ -318,7 +318,7 @@ radv_physical_device_init_queue_table(struct radv_physical_device *pdev)
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       pdev->vk_queue_to_radv[idx] = RADV_QUEUE_SPARSE;
       idx++;
    }
@@ -2401,7 +2401,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
       num_queue_families++;
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       num_queue_families++;
    }
@@ -2415,9 +2415,8 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    idx = 0;
    if (*pCount >= 1) {
-      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         gfx_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
+                               VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
 
       *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
          .queueFlags = gfx_flags,
          .queueCount = 1,
@@ -2428,9 +2427,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    }
 
    if (pdev->info.ip[AMD_IP_COMPUTE].num_queues > 0 && !(instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
-      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         compute_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = compute_flags,
@@ -2459,7 +2456,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
    if (radv_transfer_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
-            .queueFlags = VK_QUEUE_TRANSFER_BIT,
+            .queueFlags = VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT,
             .queueCount = pdev->info.ip[AMD_IP_SDMA].num_queues,
             .timestampValidBits = 64,
             .minImageTransferGranularity = (VkExtent3D){16, 16, 8},
@@ -2482,7 +2479,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pdev
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = VK_QUEUE_SPARSE_BINDING_BIT,

@@ -195,7 +195,7 @@ radv_physical_device_instance(const struct radv_physical_device *pdev)
 }
 
 static inline bool
-radv_sparse_queue_enabled(const struct radv_physical_device *pdev)
+radv_dedicated_sparse_queue_enabled(const struct radv_physical_device *pdev)
 {
    const struct radv_instance *instance = radv_physical_device_instance(pdev);

@@ -1896,16 +1896,10 @@ radv_queue_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
 {
    struct radv_queue *queue = (struct radv_queue *)vqueue;
    struct radv_device *device = radv_queue_device(queue);
-   const struct radv_physical_device *pdev = radv_device_physical(device);
-   VkResult result;
 
-   if (!radv_sparse_queue_enabled(pdev)) {
-      result = radv_queue_submit_bind_sparse_memory(device, submission);
-      if (result != VK_SUCCESS)
-         goto fail;
-   } else {
-      assert(!submission->buffer_bind_count && !submission->image_bind_count && !submission->image_opaque_bind_count);
-   }
+   VkResult result = radv_queue_submit_bind_sparse_memory(device, submission);
+   if (result != VK_SUCCESS)
+      goto fail;
 
    if (!submission->command_buffer_count && !submission->wait_count && !submission->signal_count)
       return VK_SUCCESS;