mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 20:38:06 +02:00
radv: handle indirect pipeline binds with scratch and DGC
vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer operations and it's not required to call it on the same queue as DGC execute. This is very annoying if the compute shader has scratch because it needs to be configured per queue. The solution is to gather the maximum possible scratch size used by indirect compute pipelines and use that to configure scratch. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27495>
This commit is contained in:
parent
a2d67adff1
commit
c253a76f5d
5 changed files with 37 additions and 0 deletions
|
|
@ -461,6 +461,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
|
|||
cmd_buffer->gang.sem.emitted_leader_value = 0;
|
||||
cmd_buffer->gang.sem.va = 0;
|
||||
cmd_buffer->shader_upload_seq = 0;
|
||||
cmd_buffer->has_indirect_pipeline_binds = false;
|
||||
|
||||
if (cmd_buffer->upload.upload_bo)
|
||||
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
|
||||
|
|
@ -9937,6 +9938,9 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
|
|||
|
||||
if (compute) {
|
||||
radv_dgc_before_dispatch(cmd_buffer);
|
||||
|
||||
if (!pGeneratedCommandsInfo->pipeline)
|
||||
cmd_buffer->has_indirect_pipeline_binds = true;
|
||||
} else {
|
||||
struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
|
||||
struct radv_draw_info info;
|
||||
|
|
|
|||
|
|
@ -740,6 +740,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
|
|||
simple_mtx_init(&device->trace_mtx, mtx_plain);
|
||||
simple_mtx_init(&device->pstate_mtx, mtx_plain);
|
||||
simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
|
||||
simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
|
||||
|
||||
device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
|
||||
|
||||
|
|
@ -1122,6 +1123,7 @@ fail_queue:
|
|||
simple_mtx_destroy(&device->pstate_mtx);
|
||||
simple_mtx_destroy(&device->trace_mtx);
|
||||
simple_mtx_destroy(&device->rt_handles_mtx);
|
||||
simple_mtx_destroy(&device->compute_scratch_mtx);
|
||||
mtx_destroy(&device->overallocation_mutex);
|
||||
|
||||
vk_device_finish(&device->vk);
|
||||
|
|
@ -1185,6 +1187,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
|
|||
simple_mtx_destroy(&device->pstate_mtx);
|
||||
simple_mtx_destroy(&device->trace_mtx);
|
||||
simple_mtx_destroy(&device->rt_handles_mtx);
|
||||
simple_mtx_destroy(&device->compute_scratch_mtx);
|
||||
|
||||
radv_trap_handler_finish(device);
|
||||
radv_finish_trace(device);
|
||||
|
|
|
|||
|
|
@ -320,9 +320,21 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC
|
|||
if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) {
|
||||
const VkComputePipelineIndirectBufferInfoNV *indirect_buffer =
|
||||
vk_find_struct_const(pCreateInfo->pNext, COMPUTE_PIPELINE_INDIRECT_BUFFER_INFO_NV);
|
||||
struct radv_shader *cs = pipeline->base.shaders[MESA_SHADER_COMPUTE];
|
||||
|
||||
pipeline->indirect.va = indirect_buffer->deviceAddress;
|
||||
pipeline->indirect.size = indirect_buffer->size;
|
||||
|
||||
/* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer
|
||||
* operations and it's not required to call it on the same queue as the DGC execute. Because
|
||||
* it's not possible to know if the compute shader uses scratch when DGC execute is called,
|
||||
 * the only solution is to gather the max scratch size of all indirect pipelines.
|
||||
*/
|
||||
simple_mtx_lock(&device->compute_scratch_mtx);
|
||||
device->compute_scratch_size_per_wave =
|
||||
MAX2(device->compute_scratch_size_per_wave, cs->config.scratch_bytes_per_wave);
|
||||
device->compute_scratch_waves = MAX2(device->compute_scratch_waves, radv_get_max_scratch_waves(device, cs));
|
||||
simple_mtx_unlock(&device->compute_scratch_mtx);
|
||||
}
|
||||
|
||||
*pPipeline = radv_pipeline_to_handle(&pipeline->base);
|
||||
|
|
|
|||
|
|
@ -1268,6 +1268,11 @@ struct radv_device {
|
|||
|
||||
/* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
|
||||
char *gpu_hang_report;
|
||||
|
||||
/* For indirect compute pipeline binds with DGC only. */
|
||||
simple_mtx_t compute_scratch_mtx;
|
||||
uint32_t compute_scratch_size_per_wave;
|
||||
uint32_t compute_scratch_waves;
|
||||
};
|
||||
|
||||
bool radv_device_set_pstate(struct radv_device *device, bool enable);
|
||||
|
|
@ -1857,6 +1862,7 @@ struct radv_cmd_buffer {
|
|||
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
|
||||
bool gds_oa_needed; /* for GFX10 streamout */
|
||||
bool sample_positions_needed;
|
||||
bool has_indirect_pipeline_binds;
|
||||
|
||||
uint64_t gfx9_fence_va;
|
||||
uint32_t gfx9_fence_idx;
|
||||
|
|
|
|||
|
|
@ -1169,6 +1169,8 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
|
|||
struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count, bool *use_perf_counters,
|
||||
bool *has_follower)
|
||||
{
|
||||
bool has_indirect_pipeline_binds = false;
|
||||
|
||||
if (queue->qf != RADV_QUEUE_GENERAL && queue->qf != RADV_QUEUE_COMPUTE) {
|
||||
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
|
||||
struct radv_cmd_buffer *cmd_buffer = container_of(cmd_buffers[j], struct radv_cmd_buffer, vk);
|
||||
|
|
@ -1207,6 +1209,16 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
|
|||
needs.sample_positions |= cmd_buffer->sample_positions_needed;
|
||||
*use_perf_counters |= cmd_buffer->state.uses_perf_counters;
|
||||
*has_follower |= !!cmd_buffer->gang.cs;
|
||||
|
||||
has_indirect_pipeline_binds |= cmd_buffer->has_indirect_pipeline_binds;
|
||||
}
|
||||
|
||||
if (has_indirect_pipeline_binds) {
|
||||
/* Use the maximum possible scratch size for indirect compute pipelines with DGC. */
|
||||
simple_mtx_lock(&device->compute_scratch_mtx);
|
||||
needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_size_per_wave, device->compute_scratch_size_per_wave);
|
||||
needs.compute_scratch_waves = MAX2(needs.compute_scratch_waves, device->compute_scratch_waves);
|
||||
simple_mtx_unlock(&device->compute_scratch_mtx);
|
||||
}
|
||||
|
||||
/* Sanitize scratch size information. */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue