radv: handle indirect pipeline binds with scratch and DGC

vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue
supporting transfer operations and it's not required to call it on the
same queue as DGC execute. This is very annoying if the compute shader
has scratch because it needs to be configured per queue.

The solution is to gather the maximum possible scratch size used by
indirect compute pipelines and use that to configure scratch.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27495>
This commit is contained in:
Samuel Pitoiset 2024-02-19 17:06:25 +01:00 committed by Marge Bot
parent a2d67adff1
commit c253a76f5d
5 changed files with 37 additions and 0 deletions

View file

@ -461,6 +461,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
cmd_buffer->gang.sem.emitted_leader_value = 0;
cmd_buffer->gang.sem.va = 0;
cmd_buffer->shader_upload_seq = 0;
cmd_buffer->has_indirect_pipeline_binds = false;
if (cmd_buffer->upload.upload_bo)
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@ -9937,6 +9938,9 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
if (compute) {
radv_dgc_before_dispatch(cmd_buffer);
if (!pGeneratedCommandsInfo->pipeline)
cmd_buffer->has_indirect_pipeline_binds = true;
} else {
struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
struct radv_draw_info info;

View file

@ -740,6 +740,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
simple_mtx_init(&device->trace_mtx, mtx_plain);
simple_mtx_init(&device->pstate_mtx, mtx_plain);
simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
@ -1122,6 +1123,7 @@ fail_queue:
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
simple_mtx_destroy(&device->compute_scratch_mtx);
mtx_destroy(&device->overallocation_mutex);
vk_device_finish(&device->vk);
@ -1185,6 +1187,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
simple_mtx_destroy(&device->compute_scratch_mtx);
radv_trap_handler_finish(device);
radv_finish_trace(device);

View file

@ -320,9 +320,21 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC
if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) {
const VkComputePipelineIndirectBufferInfoNV *indirect_buffer =
vk_find_struct_const(pCreateInfo->pNext, COMPUTE_PIPELINE_INDIRECT_BUFFER_INFO_NV);
struct radv_shader *cs = pipeline->base.shaders[MESA_SHADER_COMPUTE];
pipeline->indirect.va = indirect_buffer->deviceAddress;
pipeline->indirect.size = indirect_buffer->size;
/* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer
* operations and it's not required to call it on the same queue as the DGC execute. Because
* it's not possible to know if the compute shader uses scratch when DGC execute is called,
 * the only solution is to gather the max scratch size of all indirect pipelines.
*/
simple_mtx_lock(&device->compute_scratch_mtx);
device->compute_scratch_size_per_wave =
MAX2(device->compute_scratch_size_per_wave, cs->config.scratch_bytes_per_wave);
device->compute_scratch_waves = MAX2(device->compute_scratch_waves, radv_get_max_scratch_waves(device, cs));
simple_mtx_unlock(&device->compute_scratch_mtx);
}
*pPipeline = radv_pipeline_to_handle(&pipeline->base);

View file

@ -1268,6 +1268,11 @@ struct radv_device {
/* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
char *gpu_hang_report;
/* For indirect compute pipeline binds with DGC only. */
simple_mtx_t compute_scratch_mtx;
uint32_t compute_scratch_size_per_wave;
uint32_t compute_scratch_waves;
};
bool radv_device_set_pstate(struct radv_device *device, bool enable);
@ -1857,6 +1862,7 @@ struct radv_cmd_buffer {
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;
bool has_indirect_pipeline_binds;
uint64_t gfx9_fence_va;
uint32_t gfx9_fence_idx;

View file

@ -1169,6 +1169,8 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count, bool *use_perf_counters,
bool *has_follower)
{
bool has_indirect_pipeline_binds = false;
if (queue->qf != RADV_QUEUE_GENERAL && queue->qf != RADV_QUEUE_COMPUTE) {
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
struct radv_cmd_buffer *cmd_buffer = container_of(cmd_buffers[j], struct radv_cmd_buffer, vk);
@ -1207,6 +1209,16 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.sample_positions |= cmd_buffer->sample_positions_needed;
*use_perf_counters |= cmd_buffer->state.uses_perf_counters;
*has_follower |= !!cmd_buffer->gang.cs;
has_indirect_pipeline_binds |= cmd_buffer->has_indirect_pipeline_binds;
}
if (has_indirect_pipeline_binds) {
/* Use the maximum possible scratch size for indirect compute pipelines with DGC. */
simple_mtx_lock(&device->compute_scratch_mtx);
needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_size_per_wave, device->compute_scratch_size_per_wave);
needs.compute_scratch_waves = MAX2(needs.compute_scratch_waves, device->compute_scratch_waves);
simple_mtx_unlock(&device->compute_scratch_mtx);
}
/* Sanitize scratch size information. */