radv: Add compute DGC preprocessing support.

This should reduce the overhead due to reduced syncs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25835>
This commit is contained in:
Bas Nieuwenhuizen 2023-10-18 12:04:42 +02:00 committed by Marge Bot
parent 108227a84e
commit c4fb827441
3 changed files with 56 additions and 17 deletions

View file

@ -9519,18 +9519,6 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b
static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
static bool
radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
/* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
* the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
* not possible to skip the execute DGC call (ie. no INDIRECT_PACKET)
*/
return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
}
VKAPI_ATTR void VKAPI_CALL
radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
@ -9557,7 +9545,11 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
cmd_buffer->state.predicating = true;
}
radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
if (!layout->use_preprocess) {
radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
}
if (compute) {
radv_dgc_before_dispatch(cmd_buffer);

View file

@ -1337,6 +1337,18 @@ radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLa
if (!layout->indexed)
layout->binds_index_buffer = false;
layout->use_preprocess = pCreateInfo->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV;
/* From the Vulkan spec (1.3.269, chapter 32):
* "The bound descriptor sets and push constants that will be used with indirect command generation for the compute
* piplines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV.
* They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV."
*
* So we can always preprocess compute layouts.
*/
if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE)
layout->use_preprocess = false;
*pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
return VK_SUCCESS;
}
@ -1379,12 +1391,45 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device,
align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
}
bool
radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
/* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
* the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
* not possible to skip the execute DGC call (ie. no INDIRECT_PACKET)
*/
return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
}
VKAPI_ATTR void VKAPI_CALL
radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
/* Can't do anything here as we depend on some dynamic state in some cases that we only know
* at draw time. */
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
if (!layout->use_preprocess)
return;
const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
if (use_predication) {
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
pGeneratedCommandsInfo->sequencesCountOffset;
radv_begin_conditional_rendering(cmd_buffer, va, true);
cmd_buffer->state.predicating = true;
}
radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
if (use_predication) {
cmd_buffer->state.predicating = false;
radv_end_conditional_rendering(cmd_buffer);
}
}
/* Always need to call this directly before draw due to dependence on bound state. */
@ -1658,8 +1703,6 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
radv_buffer_finish(&token_buffer);
radv_meta_restore(&saved_state, cmd_buffer);
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
}
/* VK_NV_device_generated_commands_compute */

View file

@ -3257,11 +3257,15 @@ struct radv_indirect_command_layout {
uint32_t ibo_type_32;
uint32_t ibo_type_8;
bool use_preprocess;
VkIndirectCommandsLayoutTokenNV tokens[0];
};
uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
static inline uint32_t