From 3004923ae38fcd21b5d0f0289e84de90d32c3ce1 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Fri, 16 Feb 2024 18:36:53 +0100
Subject: [PATCH] radv: add support for conditional rendering on the compute
 queue with DGC

Conditional rendering is annoying to implement on ACE because there is
no predication packet like on GFX. With DGC it's even worse because ACE
is also missing the IB2 packet, which means it's not possible to
predicate the DGC IB entirely.

The solution is to always run the DGC prepare shader when conditional
rendering is enabled, so that it generates a cmdbuf which only contains
NOPs when the commands must be discarded.

Signed-off-by: Samuel Pitoiset
Part-of:
---
 src/amd/vulkan/radv_cmd_buffer.c              | 16 +++++-
 .../vulkan/radv_device_generated_commands.c   | 52 +++++++++++++++++--
 src/amd/vulkan/radv_private.h                 |  3 +-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c |  2 -
 4 files changed, 65 insertions(+), 8 deletions(-)
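
For context: on the application side, this enables plain
VK_EXT_conditional_rendering around a DGC execute on a compute queue.
A minimal sketch, assuming hypothetical handles cmdbuf, cond_buf and
dgc_info (none of them come from this patch; the EXT entry points must
be loaded via vkGetDeviceProcAddr in real code):

   /* cond_buf holds a 32-bit predicate value at offset 0. */
   const VkConditionalRenderingBeginInfoEXT cond_info = {
      .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
      .buffer = cond_buf,
      .offset = 0,
      .flags = 0, /* default: commands execute when the value is non-zero */
   };

   vkCmdBeginConditionalRenderingEXT(cmdbuf, &cond_info);
   /* ACE has no predication/IB2 packet, so RADV runs the DGC prepare shader
    * anyway and lets it emit a NOP-only cmdbuf when the predicate says skip. */
   vkCmdExecuteGeneratedCommandsNV(cmdbuf, VK_FALSE, &dgc_info);
   vkCmdEndConditionalRenderingEXT(cmdbuf);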
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index aabb9de440e..b9bd4f5cd42 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -9913,7 +9913,21 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
    }
 
    if (!radv_dgc_can_preprocess(layout, pipeline)) {
-      radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
+      const bool old_predicating = cmd_buffer->state.predicating;
+
+      if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && cmd_buffer->state.predicating) {
+         /* Suspend conditional rendering when the DGC execute is called on the compute queue to
+          * generate a cmdbuf that skips dispatches when necessary. This is because the compute
+          * queue is missing IB2, which means it's not possible to skip the cmdbuf entirely.
+          */
+         cmd_buffer->state.predicating = false;
+      }
+
+      radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
+
+      if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
+         cmd_buffer->state.predicating = old_predicating;
+      }
 
       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
    }
diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c
index ff46436b328..b33271f41fb 100644
--- a/src/amd/vulkan/radv_device_generated_commands.c
+++ b/src/amd/vulkan/radv_device_generated_commands.c
@@ -230,6 +230,11 @@ struct radv_dgc_params {
 
    uint8_t is_dispatch;
    uint8_t use_preamble;
+
+   /* For conditional rendering on ACE. */
+   uint8_t predicating;
+   uint8_t predication_type;
+   uint64_t predication_va;
 };
 
 enum {
@@ -1194,6 +1199,28 @@ dgc_emit_draw_mesh_tasks(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_
    nir_pop_if(b, NULL);
 }
 
+static nir_def *
+dgc_is_cond_render_enabled(nir_builder *b)
+{
+   nir_def *res1, *res2;
+
+   nir_push_if(b, nir_ieq_imm(b, load_param8(b, predicating), 1));
+   {
+      nir_def *val = nir_load_global(b, load_param64(b, predication_va), 4, 1, 32);
+
+      /* By default, all rendering commands are discarded if the 32-bit value is zero. If the
+       * inverted flag is set, they are discarded if the value is non-zero.
+       */
+      res1 = nir_ixor(b, nir_i2b(b, load_param8(b, predication_type)), nir_ine_imm(b, val, 0));
+   }
+   nir_push_else(b, NULL);
+   {
+      res2 = nir_imm_bool(b, false);
+   }
+   nir_pop_if(b, NULL);
+
+   return nir_if_phi(b, res1, res2);
+}
+
 static nir_shader *
 build_dgc_prepare_shader(struct radv_device *dev)
 {
@@ -1231,6 +1258,15 @@ build_dgc_prepare_shader(struct radv_device *dev)
    }
    nir_pop_if(&b, NULL);
 
+   nir_push_if(&b, dgc_is_cond_render_enabled(&b));
+   {
+      /* Reset the number of sequences when conditional rendering is enabled in order to skip the
+       * entire shader and pad the cmdbuf with NOPs.
+       */
+      nir_store_var(&b, count_var, nir_imm_int(&b, 0), 0x1);
+   }
+   nir_pop_if(&b, NULL);
+
    sequence_count = nir_load_var(&b, count_var);
 
    nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
@@ -1634,7 +1670,7 @@ radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
    const bool old_predicating = cmd_buffer->state.predicating;
    cmd_buffer->state.predicating = false;
 
-   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
+   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, false);
 
    /* Restore conditional rendering. */
    cmd_buffer->state.predicating = old_predicating;
@@ -1716,7 +1752,7 @@ radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedC
 static void
 radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
                          unsigned *upload_size, unsigned *upload_offset, void **upload_data,
-                         struct radv_dgc_params *params)
+                         struct radv_dgc_params *params, bool cond_render_enabled)
 {
    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
@@ -1741,6 +1777,12 @@ radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo
    params->dispatch_initiator = dispatch_initiator;
    params->is_dispatch = 1;
 
+   if (cond_render_enabled) {
+      params->predicating = true;
+      params->predication_va = cmd_buffer->state.predication_va;
+      params->predication_type = cmd_buffer->state.predication_type;
+   }
+
    const struct radv_userdata_info *loc = radv_get_user_sgpr(cs, AC_UD_CS_GRID_SIZE);
    if (loc->sgpr_idx != -1) {
       params->grid_base_sgpr = (cs->info.user_data_0 + 4 * loc->sgpr_idx - SI_SH_REG_OFFSET) >> 2;
@@ -1748,7 +1790,8 @@ radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo
 }
 
 void
-radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
+                 bool cond_render_enabled)
 {
    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
@@ -1792,7 +1835,8 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
                                &params);
    } else {
       assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
-      radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params);
+      radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params,
+                               cond_render_enabled);
    }
 
    if (layout->push_constant_mask) {
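
For reference, the predicate computed by dgc_is_cond_render_enabled()
above boils down to a single XOR. A scalar C equivalent (the encoding
predication_type == 1 for the non-inverted mode is inferred from the
comment in the shader code, so treat it as an assumption):

   #include <stdbool.h>
   #include <stdint.h>

   /* Returns true when the generated commands must be discarded, i.e. when
    * the prepare shader has to turn the cmdbuf into NOPs. */
   static bool dgc_must_discard(uint8_t predicating, uint8_t predication_type,
                                const uint32_t *predication_ptr)
   {
      if (predicating != 1)
         return false; /* conditional rendering disabled */

      const uint32_t val = *predication_ptr; /* the nir_load_global() above */

      /* type=1, val!=0 -> keep     type=1, val==0 -> discard
       * type=0, val!=0 -> discard  type=0, val==0 -> keep */
      return (predication_type != 0) ^ (val != 0);
   }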
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index f35e1a35e03..3a435e7b0ae 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -3282,7 +3282,8 @@ uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info
 bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
                               const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
 
-void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
+void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
+                      bool cond_render_enabled);
 
 bool radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline);
 
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 889284f932f..29f6a5ff75d 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -779,8 +779,6 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
    const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
    VkResult result;
 
-   assert(!predicate);
-
    /* Finalize the current CS without chaining to execute the external IB. */
    radv_amdgpu_cs_finalize(_cs);
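
For reference, once dgc_is_cond_render_enabled() forces the sequence
count to 0, the prepare shader emits no packets and its usual tail
padding covers the whole IB. A CPU-side sketch of that padding (the
real work happens in the NIR shader; 0xffff1000 is PKT3(PKT3_NOP,
0x3fff, 0), aka PKT3_NOP_PAD in radv, a 1-dword NOP on GFX7+):

   #include <stdint.h>

   /* Fill the unused tail of an IB with single-dword PKT3 NOP packets. */
   static void pad_ib_with_nops(uint32_t *ib, uint32_t used_dw, uint32_t size_dw)
   {
      for (uint32_t i = used_dw; i < size_dw; i++)
         ib[i] = 0xffff1000; /* PKT3(PKT3_NOP, 0x3fff, 0) */
   }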