radv: handle indirect pipeline binds with scratch and DGC

vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue
supporting transfer operations and it's not required to call it on the
same queue as DGC execute. This is very annoying if the compute shader
has scratch because it needs to be configured per queue.

The solution is to gather the maximum possible scratch size used by
indirect compute pipelines and use that to configure scratch.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27495>
This commit is contained in:
Samuel Pitoiset 2024-02-19 17:06:25 +01:00 committed by Marge Bot
parent a2d67adff1
commit c253a76f5d
5 changed files with 37 additions and 0 deletions

View file

@ -461,6 +461,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
cmd_buffer->gang.sem.emitted_leader_value = 0;
cmd_buffer->gang.sem.va = 0;
cmd_buffer->shader_upload_seq = 0;
cmd_buffer->has_indirect_pipeline_binds = false;
if (cmd_buffer->upload.upload_bo)
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@ -9937,6 +9938,9 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
if (compute) {
radv_dgc_before_dispatch(cmd_buffer);
if (!pGeneratedCommandsInfo->pipeline)
cmd_buffer->has_indirect_pipeline_binds = true;
} else {
struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
struct radv_draw_info info;

View file

@ -740,6 +740,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
simple_mtx_init(&device->trace_mtx, mtx_plain);
simple_mtx_init(&device->pstate_mtx, mtx_plain);
simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
@ -1122,6 +1123,7 @@ fail_queue:
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
simple_mtx_destroy(&device->compute_scratch_mtx);
mtx_destroy(&device->overallocation_mutex);
vk_device_finish(&device->vk);
@ -1185,6 +1187,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
simple_mtx_destroy(&device->compute_scratch_mtx);
radv_trap_handler_finish(device);
radv_finish_trace(device);

View file

@ -320,9 +320,21 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC
if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) {
const VkComputePipelineIndirectBufferInfoNV *indirect_buffer =
vk_find_struct_const(pCreateInfo->pNext, COMPUTE_PIPELINE_INDIRECT_BUFFER_INFO_NV);
struct radv_shader *cs = pipeline->base.shaders[MESA_SHADER_COMPUTE];
pipeline->indirect.va = indirect_buffer->deviceAddress;
pipeline->indirect.size = indirect_buffer->size;
/* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer
* operations and it's not required to call it on the same queue as the DGC execute. Because
* it's not possible to know if the compute shader uses scratch when DGC execute is called,
 * the only solution is to gather the max scratch size of all indirect pipelines.
*/
simple_mtx_lock(&device->compute_scratch_mtx);
device->compute_scratch_size_per_wave =
MAX2(device->compute_scratch_size_per_wave, cs->config.scratch_bytes_per_wave);
device->compute_scratch_waves = MAX2(device->compute_scratch_waves, radv_get_max_scratch_waves(device, cs));
simple_mtx_unlock(&device->compute_scratch_mtx);
}
*pPipeline = radv_pipeline_to_handle(&pipeline->base);

View file

@ -1268,6 +1268,11 @@ struct radv_device {
/* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
char *gpu_hang_report;
/* For indirect compute pipeline binds with DGC only. */
simple_mtx_t compute_scratch_mtx;
uint32_t compute_scratch_size_per_wave;
uint32_t compute_scratch_waves;
};
bool radv_device_set_pstate(struct radv_device *device, bool enable);
@ -1857,6 +1862,7 @@ struct radv_cmd_buffer {
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;
bool has_indirect_pipeline_binds;
uint64_t gfx9_fence_va;
uint32_t gfx9_fence_idx;

View file

@ -1169,6 +1169,8 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count, bool *use_perf_counters,
bool *has_follower)
{
bool has_indirect_pipeline_binds = false;
if (queue->qf != RADV_QUEUE_GENERAL && queue->qf != RADV_QUEUE_COMPUTE) {
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
struct radv_cmd_buffer *cmd_buffer = container_of(cmd_buffers[j], struct radv_cmd_buffer, vk);
@ -1207,6 +1209,16 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.sample_positions |= cmd_buffer->sample_positions_needed;
*use_perf_counters |= cmd_buffer->state.uses_perf_counters;
*has_follower |= !!cmd_buffer->gang.cs;
has_indirect_pipeline_binds |= cmd_buffer->has_indirect_pipeline_binds;
}
if (has_indirect_pipeline_binds) {
/* Use the maximum possible scratch size for indirect compute pipelines with DGC. */
simple_mtx_lock(&device->compute_scratch_mtx);
needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_size_per_wave, device->compute_scratch_size_per_wave);
needs.compute_scratch_waves = MAX2(needs.compute_scratch_waves, device->compute_scratch_waves);
simple_mtx_unlock(&device->compute_scratch_mtx);
}
/* Sanitize scratch size information. */