diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c index eda1fdf00b2..520dc2c741d 100644 --- a/src/amd/vulkan/layers/radv_sqtt_layer.c +++ b/src/amd/vulkan/layers/radv_sqtt_layer.c @@ -1263,11 +1263,12 @@ sqtt_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou } VKAPI_ATTR void VKAPI_CALL -sqtt_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, - const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) +sqtt_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { /* There is no ExecuteIndirect Vulkan event in RGP yet. */ - API_MARKER_ALIAS(ExecuteGeneratedCommandsNV, ExecuteCommands, commandBuffer, isPreprocessed, pGeneratedCommandsInfo); + API_MARKER_ALIAS(ExecuteGeneratedCommandsEXT, ExecuteCommands, commandBuffer, isPreprocessed, + pGeneratedCommandsInfo); } VKAPI_ATTR void VKAPI_CALL diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 087b31b5af6..46af9008134 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -111,8 +111,8 @@ libradv_files = files( 'radv_device_memory.h', 'radv_descriptor_set.c', 'radv_descriptor_set.h', - 'radv_device_generated_commands.c', - 'radv_device_generated_commands.h', + 'radv_dgc.c', + 'radv_dgc.h', 'radv_event.c', 'radv_event.h', 'radv_formats.c', diff --git a/src/amd/vulkan/meta/radv_meta.c b/src/amd/vulkan/meta/radv_meta.c index 0eac0e6450d..63a94854f5b 100644 --- a/src/amd/vulkan/meta/radv_meta.c +++ b/src/amd/vulkan/meta/radv_meta.c @@ -511,7 +511,7 @@ radv_device_init_meta(struct radv_device *device) if (result != VK_SUCCESS) goto fail_astc_decode; - if (radv_uses_device_generated_commands(device)) { + if (device->vk.enabled_features.deviceGeneratedCommands) { result = radv_device_init_dgc_prepare_state(device, on_demand); if (result != VK_SUCCESS) goto fail_dgc; diff --git a/src/amd/vulkan/radv_buffer.c b/src/amd/vulkan/radv_buffer.c index 0bff0f7b20d..b265255d8cf 100644 --- a/src/amd/vulkan/radv_buffer.c +++ b/src/amd/vulkan/radv_buffer.c @@ -194,23 +194,11 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz pMemoryRequirements->memoryRequirements.memoryTypeBits = ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit; - /* Allow 32-bit address-space for DGC usage, as this buffer will contain - * cmd buffer upload buffers, and those get passed to shaders through 32-bit - * pointers. - * - * We only allow it with this usage set, to "protect" the 32-bit address space - * from being overused. The actual requirement is done as part of - * vkGetGeneratedCommandsMemoryRequirementsNV. (we have to make sure their - * intersection is non-zero at least) - */ - if ((usage & VK_BUFFER_USAGE_2_INDIRECT_BUFFER_BIT_KHR) && radv_uses_device_generated_commands(device)) - pMemoryRequirements->memoryRequirements.memoryTypeBits |= pdev->memory_types_32bit; - /* Force 32-bit address-space for descriptor buffers usage because they are passed to shaders * through 32-bit pointers. 
*/ - if (usage & - (VK_BUFFER_USAGE_2_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | VK_BUFFER_USAGE_2_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)) + if (usage & (VK_BUFFER_USAGE_2_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_2_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | VK_BUFFER_USAGE_2_PREPROCESS_BUFFER_BIT_EXT)) pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit; if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index af770825ead..d205cebbda6 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -13,7 +13,7 @@ #include "radv_cp_dma.h" #include "radv_cs.h" #include "radv_debug.h" -#include "radv_device_generated_commands.h" +#include "radv_dgc.h" #include "radv_event.h" #include "radv_pipeline_rt.h" #include "radv_radeon_winsys.h" @@ -477,7 +477,6 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB cmd_buffer->gang.sem.emitted_leader_value = 0; cmd_buffer->gang.sem.va = 0; cmd_buffer->shader_upload_seq = 0; - cmd_buffer->has_indirect_pipeline_binds = false; if (cmd_buffer->upload.upload_bo) radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo); @@ -646,8 +645,8 @@ radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_ /* Add stage flush only when necessary. */ if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | - VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT | - VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV)) + VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */ @@ -6645,9 +6644,10 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_s if (src_stage_mask & (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | - VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | - VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { + VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) { cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; } @@ -6719,7 +6719,7 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 has_DB_meta = false; } - if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV) + if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_EXT) flush_bits |= RADV_CMD_FLAG_INV_L2; if (src_flags & (VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR)) { @@ -6808,9 +6808,8 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 flush_bits |= RADV_CMD_FLAG_INV_SCACHE; /* Ensure the DGC meta shader can read the commands. 
*/ - if (radv_uses_device_generated_commands(device)) { + if (device->vk.enabled_features.deviceGeneratedCommands) { flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE; - if (pdev->info.gfx_level < GFX9) flush_bits |= RADV_CMD_FLAG_INV_L2; } @@ -6849,7 +6848,7 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 flush_bits |= RADV_CMD_FLAG_INV_L2; } - if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV) { + if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_EXT) { flush_bits |= RADV_CMD_FLAG_INV_VCACHE; if (pdev->info.gfx_level < GFX9) flush_bits |= RADV_CMD_FLAG_INV_L2; @@ -11558,52 +11557,31 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b } /* TODO: Use these functions with the normal dispatch path. */ -static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer); +static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point); static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer); -VKAPI_ATTR void VKAPI_CALL -radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer, - const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); - - if (!radv_dgc_can_preprocess(layout, pipeline)) - return; - - /* VK_EXT_conditional_rendering says that copy commands should not be - * affected by conditional rendering. - */ - const bool old_predicating = cmd_buffer->state.predicating; - cmd_buffer->state.predicating = false; - - radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating); - - /* Restore conditional rendering. 
*/ - cmd_buffer->state.predicating = old_predicating; -} - +/* VK_EXT_device_generated_commands */ static void -radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) +radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - const bool has_task_shader = radv_dgc_with_task_shader(pGeneratedCommandsInfo); - + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); + const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK); const uint32_t cmdbuf_size = radv_get_indirect_main_cmdbuf_size(pGeneratedCommandsInfo); - const uint64_t ib_va = - radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset; - const uint64_t main_trailer_va = ib_va + radv_get_indirect_main_trailer_offset(pGeneratedCommandsInfo); + const uint64_t ib_va = pGeneratedCommandsInfo->preprocessAddress; const uint64_t main_ib_va = ib_va + radv_get_indirect_main_cmdbuf_offset(pGeneratedCommandsInfo); + const uint64_t main_trailer_va = ib_va + radv_get_indirect_main_trailer_offset(pGeneratedCommandsInfo); device->ws->cs_chain_dgc_ib(cmd_buffer->cs, main_ib_va, cmdbuf_size >> 2, main_trailer_va, cmd_buffer->state.predicating); - if (has_task_shader) { + if (task_shader) { const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo); - const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo); const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo); + const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo); assert(cmd_buffer->gang.cs); device->ws->cs_chain_dgc_ib(cmd_buffer->gang.cs, ace_ib_va, ace_cmdbuf_size >> 2, ace_trailer_va, @@ -11612,82 +11590,82 @@ radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommand } VKAPI_ATTR void VKAPI_CALL -radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, - const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) +radv_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); - VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE; + VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet); + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); const bool use_predication = 
radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo); + const bool compute = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)); + const bool rt = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)); + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); - /* Secondary command buffers are needed for the full extension but can't use - * PKT3_INDIRECT_BUFFER. - */ + if (ies) { + radv_cs_add_buffer(device->ws, cmd_buffer->cs, ies->bo); + + cmd_buffer->compute_scratch_size_per_wave_needed = + MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, ies->compute_scratch_size_per_wave); + cmd_buffer->compute_scratch_waves_wanted = + MAX2(cmd_buffer->compute_scratch_waves_wanted, ies->compute_scratch_waves); + } + + /* Secondary command buffers are banned. */ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); if (use_predication) { - VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); - const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset + - pGeneratedCommandsInfo->sequencesCountOffset; - + const uint64_t va = pGeneratedCommandsInfo->sequenceCountAddress; radv_begin_conditional_rendering(cmd_buffer, va, true); } - if (!radv_dgc_can_preprocess(layout, pipeline)) { + if (!(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT)) { /* Suspend conditional rendering when the DGC execute is called on the compute queue to - * generate a cmdbuf which will skips dispatches when necessary. This is because the - * compute queue is missing IB2 which means it's not possible to skip the cmdbuf entirely. - * It should also be suspended when task shaders are used because the DGC ACE IB would be + * generate a cmdbuf which will skip dispatches when necessary. This is because the compute + * queue is missing IB2 which means it's not possible to skip the cmdbuf entirely. This + * should also be suspended when task shaders are used because the DGC ACE IB would be * uninitialized otherwise. */ - const bool suspend_cond_render = - (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_with_task_shader(pGeneratedCommandsInfo)); + const bool suspend_conditional_rendering = + (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK)); const bool old_predicating = cmd_buffer->state.predicating; - if (suspend_cond_render && cmd_buffer->state.predicating) { + if (suspend_conditional_rendering && cmd_buffer->state.predicating) { cmd_buffer->state.predicating = false; } - radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating); + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, cmd_buffer, old_predicating); - if (suspend_cond_render) { + if (suspend_conditional_rendering) { cmd_buffer->state.predicating = old_predicating; } cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2; - if (radv_dgc_with_task_shader(pGeneratedCommandsInfo)) { - /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution - * starts. - */ + /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution + * starts. 
+ */ + if (radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK)) { radv_gang_barrier(cmd_buffer, VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV, VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT); } } - if (compute) { - radv_dgc_before_dispatch(cmd_buffer); - - if (!pGeneratedCommandsInfo->pipeline) - cmd_buffer->has_indirect_pipeline_binds = true; + if (rt) { + radv_dgc_before_dispatch(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); + } else if (compute) { + radv_dgc_before_dispatch(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE); } else { - struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); - struct radv_draw_info info; + struct radv_draw_info info = { + .count = pGeneratedCommandsInfo->maxSequenceCount, + .indirect = (void *)&info, + .indexed = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)), + }; - info.count = pGeneratedCommandsInfo->sequencesCount; - info.indirect = prep_buffer; /* We're not really going use it this way, but a good signal - that this is not direct. */ - info.indirect_offset = 0; - info.stride = 0; - info.strmout_buffer = NULL; - info.count_buffer = NULL; - info.indexed = layout->indexed; - info.instance_count = 0; - - if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) { if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true)) return; } else { @@ -11696,46 +11674,63 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre } } - const uint32_t view_mask = cmd_buffer->state.render.view_mask; - if (!radv_cmd_buffer_uses_mec(cmd_buffer)) { radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); radeon_emit(cmd_buffer->cs, 0); } - radv_cs_add_buffer(device->ws, cmd_buffer->cs, prep_buffer->bo); - - if (compute || !view_mask) { + const uint32_t view_mask = cmd_buffer->state.render.view_mask; + if (rt || compute || !view_mask) { radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo); } else { u_foreach_bit (view, view_mask) { radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view); - radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo); } } - if (compute) { + if (rt) { + cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS; + + radv_dgc_after_dispatch(cmd_buffer); + } else if (compute) { cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT; - if (!pGeneratedCommandsInfo->pipeline) + if (ies) radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE); radv_dgc_after_dispatch(cmd_buffer); } else { - struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); - - if (layout->binds_index_buffer) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) { cmd_buffer->state.last_index_type = -1; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; } - if (layout->bind_vbo_mask) + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER; - cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages; + if (pipeline_info) { + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); + struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); + + cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages; + } else { + assert(eso_info); + + for (unsigned i = 0; i < eso_info->shaderCount; ++i) { + VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]); + + cmd_buffer->push_constant_stages |= 
mesa_to_vk_shader_stage(shader_object->stage); + } + } + + if (!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED))) { + /* Non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be + * re-emitted before the next indexed draw. + */ + cmd_buffer->state.last_index_type = -1; + } - cmd_buffer->state.last_index_type = -1; cmd_buffer->state.last_num_instances = -1; cmd_buffer->state.last_vertex_offset_valid = false; cmd_buffer->state.last_first_instance = -1; @@ -12102,12 +12097,16 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf } static void -radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer) +radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) { struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); const struct radv_physical_device *pdev = radv_device_physical(device); - struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline; - struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]; + struct radv_compute_pipeline *pipeline = bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR + ? &cmd_buffer->state.rt_pipeline->base + : cmd_buffer->state.compute_pipeline; + struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR + ? cmd_buffer->state.rt_pipeline->prolog + : cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]; bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline; /* We will have run the DGC patch shaders before, so we can assume that there is something to @@ -12119,9 +12118,11 @@ radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer) if (pipeline) radv_emit_compute_pipeline(cmd_buffer, pipeline); + if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) + radv_emit_rt_stack_size(cmd_buffer); radv_emit_cache_flush(cmd_buffer); - radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE); + radv_upload_compute_shader_descriptors(cmd_buffer, bind_point); if (pipeline_is_dirty) { const bool has_prefetch = pdev->info.gfx_level >= GFX7; @@ -12136,7 +12137,9 @@ radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer) * We only need to do this when the pipeline is dirty because when we switch between * the two we always need to switch pipelines. */ - radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); + radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE + ? 
VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR + : VK_PIPELINE_BIND_POINT_COMPUTE); } } @@ -13672,42 +13675,6 @@ radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlag assert(cmd_buffer->cs->cdw <= cdw_max); } -VKAPI_ATTR void VKAPI_CALL -radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, - VkPipeline pipeline, uint32_t groupIndex) -{ - fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n"); - abort(); -} - -/* VK_NV_device_generated_commands_compute */ -VKAPI_ATTR void VKAPI_CALL -radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, - VkPipeline _pipeline) -{ - VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - const struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline); - const struct radeon_cmdbuf *cs = &compute_pipeline->indirect.cs; - const uint64_t va = compute_pipeline->indirect.va; - struct radv_compute_pipeline_metadata metadata; - uint32_t offset = 0; - - radv_get_compute_shader_metadata(device, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], &metadata); - - radv_write_data(cmd_buffer, V_370_ME, va + offset, sizeof(metadata) / 4, (const uint32_t *)&metadata, false); - offset += sizeof(metadata); - - radv_write_data(cmd_buffer, V_370_ME, va + offset, 1, (const uint32_t *)&cs->cdw, false); - offset += sizeof(uint32_t); - - radv_write_data(cmd_buffer, V_370_ME, va + offset, cs->cdw, (const uint32_t *)cs->buf, false); - offset += cs->cdw * sizeof(uint32_t); - - assert(offset < compute_pipeline->indirect.size); -} - /* VK_EXT_descriptor_buffer */ VKAPI_ATTR void VKAPI_CALL radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount, diff --git a/src/amd/vulkan/radv_cmd_buffer.h b/src/amd/vulkan/radv_cmd_buffer.h index f5a7c7761ba..f3e2c230730 100644 --- a/src/amd/vulkan/radv_cmd_buffer.h +++ b/src/amd/vulkan/radv_cmd_buffer.h @@ -537,7 +537,6 @@ struct radv_cmd_buffer { bool gds_needed; /* for GFX10 streamout and NGG GS queries */ bool gds_oa_needed; /* for GFX10 streamout */ bool sample_positions_needed; - bool has_indirect_pipeline_binds; uint64_t gfx9_fence_va; uint32_t gfx9_fence_idx; diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 2e6b940425c..02810372a1d 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1100,7 +1100,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr simple_mtx_init(&device->trace_mtx, mtx_plain); simple_mtx_init(&device->pstate_mtx, mtx_plain); simple_mtx_init(&device->rt_handles_mtx, mtx_plain); - simple_mtx_init(&device->compute_scratch_mtx, mtx_plain); simple_mtx_init(&device->pso_cache_stats_mtx, mtx_plain); device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal); @@ -1359,7 +1358,6 @@ fail_queue: simple_mtx_destroy(&device->pstate_mtx); simple_mtx_destroy(&device->trace_mtx); simple_mtx_destroy(&device->rt_handles_mtx); - simple_mtx_destroy(&device->compute_scratch_mtx); simple_mtx_destroy(&device->pso_cache_stats_mtx); mtx_destroy(&device->overallocation_mutex); @@ -1417,7 +1415,6 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) simple_mtx_destroy(&device->pstate_mtx); simple_mtx_destroy(&device->trace_mtx); simple_mtx_destroy(&device->rt_handles_mtx); - 
simple_mtx_destroy(&device->compute_scratch_mtx); simple_mtx_destroy(&device->pso_cache_stats_mtx); radv_destroy_shader_arenas(device); diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h index 39e4907728f..9fd158557d2 100644 --- a/src/amd/vulkan/radv_device.h +++ b/src/amd/vulkan/radv_device.h @@ -541,11 +541,6 @@ struct radv_device { /* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */ char *gpu_hang_report; - /* For indirect compute pipeline binds with DGC only. */ - simple_mtx_t compute_scratch_mtx; - uint32_t compute_scratch_size_per_wave; - uint32_t compute_scratch_waves; - /* PSO cache stats */ simple_mtx_t pso_cache_stats_mtx; struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT]; @@ -559,12 +554,6 @@ radv_device_physical(const struct radv_device *dev) return (struct radv_physical_device *)dev->vk.physical; } -static inline bool -radv_uses_device_generated_commands(const struct radv_device *device) -{ - return device->vk.enabled_features.deviceGeneratedCommandsNV || device->vk.enabled_features.deviceGeneratedCompute; -} - static inline bool radv_uses_primitives_generated_query(const struct radv_device *device) { diff --git a/src/amd/vulkan/radv_device_generated_commands.h b/src/amd/vulkan/radv_device_generated_commands.h deleted file mode 100644 index 0e739db3233..00000000000 --- a/src/amd/vulkan/radv_device_generated_commands.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright © 2016 Red Hat. - * Copyright © 2016 Bas Nieuwenhuizen - * - * based in part on anv driver which is: - * Copyright © 2015 Intel Corporation - * - * SPDX-License-Identifier: MIT - */ - -#ifndef RADV_DEVICE_GENERATED_COMMANDS_H -#define RADV_DEVICE_GENERATED_COMMANDS_H - -#include "vk_object.h" - -#include "radv_constants.h" - -struct radv_cmd_buffer; -struct radv_pipeline; - -struct radv_indirect_command_layout { - struct vk_object_base base; - - VkIndirectCommandsLayoutUsageFlagsNV flags; - VkPipelineBindPoint pipeline_bind_point; - - uint32_t input_stride; - uint32_t token_count; - - bool indexed; - bool binds_index_buffer; - bool draw_mesh_tasks; - uint16_t draw_params_offset; - uint16_t index_buffer_offset; - - uint16_t dispatch_params_offset; - - bool bind_pipeline; - uint16_t pipeline_params_offset; - - bool vertex_dynamic_stride; - uint32_t bind_vbo_mask; - uint32_t vbo_offsets[MAX_VBS]; - - uint64_t push_constant_mask; - uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4]; - uint32_t push_constant_size; - - uint32_t ibo_type_32; - uint32_t ibo_type_8; - - VkPipeline pipeline; - - VkIndirectCommandsLayoutTokenNV tokens[0]; -}; - -VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, base, VkIndirectCommandsLayoutNV, - VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV) - -uint32_t radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info); - -uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info); - -uint32_t radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info); - -uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info); - -uint32_t radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info); - -uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info); - -bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, - const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo); - -bool radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct 
radv_pipeline *pipeline); - -bool radv_dgc_with_task_shader(const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo); - -void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo, - bool cond_render_enabled); - -#endif /* RADV_DEVICE_GENERATED_COMMANDS_H */ diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_dgc.c similarity index 61% rename from src/amd/vulkan/radv_device_generated_commands.c rename to src/amd/vulkan/radv_dgc.c index 18149ac2f8d..febeb6d657e 100644 --- a/src/amd/vulkan/radv_device_generated_commands.c +++ b/src/amd/vulkan/radv_dgc.c @@ -1,23 +1,24 @@ /* - * Copyright © 2021 Google + * Copyright © 2024 Valve Corporation * * SPDX-License-Identifier: MIT */ -#include "radv_device_generated_commands.h" +#include "radv_dgc.h" #include "meta/radv_meta.h" -#include "radv_cmd_buffer.h" #include "radv_entrypoints.h" +#include "radv_pipeline_rt.h" #include "ac_rgp.h" #include "nir_builder.h" #include "vk_common_entrypoints.h" +#include "vk_device_generated_commands.h" #include "vk_shader_module.h" -#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */) #define PKT3_INDIRECT_BUFFER_BYTES 16 +#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */) /* The DGC command buffer layout is quite complex, here's some explanations: * @@ -27,10 +28,9 @@ * | trailer | commands | padding | jump to trailer | * +---------+----------+---------+-----------------+ * - * The trailer is used to implement IB chaining for compute queue because IB2 - * isn't supported. The trailer is patched at execute time on the CPU to chain - * back the DGC command buffer. The trailer is added at the beginning to make - * sure the offset is fixed (ie. not possible to know the offset with a + * The trailer is used to implement IB chaining for compute queue because IB2 isn't supported. The + * trailer is patched at execute time to chain back the DGC command buffer. The trailer is added at + * the beginning to make sure the offset is fixed (ie. not possible to know the offset with a * preamble). 
In practice the execution looks like: * * +----------+---------+-----------------+ +---------+ +-----------------------+ @@ -53,21 +53,100 @@ * * The execution of this DGC command buffer is different if it's GFX or COMPUTE queue: * - on GFX, the driver uses the IB2 packet which the easiest solution - * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command - * buffer by patching the trailer + * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the + * trailer */ -static void -radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout, - const struct radv_compute_pipeline *pipeline, uint32_t *cmd_size, uint32_t *upload_size) + +static uint32_t +radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type) { - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); const struct radv_physical_device *pdev = radv_device_physical(device); + const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4; + + return align(size, ib_alignment); +} + +static uint32_t +radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment; + + return align(size, ib_alignment); +} + +static unsigned +radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type) +{ + return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type); +} + +static unsigned +radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type) +{ + return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type); +} + +static bool +radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) +{ + /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously + * a bit of a guess as it depends on the actual count which we don't know. 
*/ + return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64; +} + +struct radv_shader * +radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info, + const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage) +{ + if (pipeline_info) { + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); + return radv_get_shader(pipeline->shaders, stage); + } else if (eso_info) { + VkShaderStageFlags stages = 0; + + for (uint32_t i = 0; i < eso_info->shaderCount; i++) { + VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]); + stages |= mesa_to_vk_shader_stage(shader_object->stage); + } + + for (uint32_t i = 0; i < eso_info->shaderCount; i++) { + VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]); + + if (shader_object->stage != stage) + continue; + + if (stage == MESA_SHADER_VERTEX && (stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) { + return shader_object->as_ls.shader; + } else if ((stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL) && + (stages & VK_SHADER_STAGE_GEOMETRY_BIT)) { + return shader_object->as_es.shader; + } else { + return shader_object->shader; + } + } + } + + return NULL; +} + +static void +radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size, + uint32_t *upload_size) +{ + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); + const struct radv_physical_device *pdev = radv_device_physical(device); + + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); + + struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE); /* dispatch */ *cmd_size += 5 * 4; - if (pipeline) { - struct radv_shader *cs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_COMPUTE); + if (cs) { const struct radv_userdata_info *loc = radv_get_user_sgpr_info(cs, AC_UD_CS_GRID_SIZE); if (loc->sgpr_idx != -1) { if (device->load_grid_size_from_user_sgpr) { @@ -112,60 +191,89 @@ radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout } static void -radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout, - const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size, - uint32_t *ace_cmd_size, uint32_t *upload_size) +radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout, const void *pNext, + uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size) { - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); const struct radv_physical_device *pdev = radv_device_physical(device); - const struct radv_shader *vs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_VERTEX); - if (layout->bind_vbo_mask) { + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); + + struct radv_shader *vs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_VERTEX); + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { *upload_size += 
16 * util_bitcount(vs->info.vs.vb_desc_usage_mask); /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */ *cmd_size += 3 * 4; } - if (layout->indexed) { - if (layout->binds_index_buffer) { - /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet - * so only 1 word overhead) + index buffer size (again, special packet so only 1 word - * overhead) - */ - *cmd_size += (3 + 3 + 2) * 4; + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) { + /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet + * so only 1 word overhead) + index buffer size (again, special packet so only 1 word + * overhead) + */ + *cmd_size += (3 + 3 + 2) * 4; + } - /* userdata writes + instance count + indexed draw */ - *cmd_size += (5 + 2 + 5) * 4; - } else { - /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */ - *cmd_size += (4 + (pipeline->uses_drawid ? 10 : 5)) * 4; - } - } else { - if (layout->draw_mesh_tasks) { - const struct radv_shader *task_shader = radv_get_shader(pipeline->base.shaders, MESA_SHADER_TASK); + if (layout->vk.draw_count) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) { + const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK); if (task_shader) { - const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE); - const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID); - /* PKT3_DISPATCH_TASKMESH_GFX */ *cmd_size += 4 * 4; - if (xyz_loc->sgpr_idx != -1) - *ace_cmd_size += 5 * 4; - if (draw_id_loc->sgpr_idx != -1) - *ace_cmd_size += 3 * 4; - - /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */ - *ace_cmd_size += 6 * 4; + /* PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */ + *ace_cmd_size += 11 * 4; } else { - /* userdata writes + instance count + non-indexed draw */ - *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4; + struct radv_shader *ms = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_MESH); + + /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DISPATCH_MESH_INDIRECT_MULTI */ + *cmd_size += (4 + (ms->info.vs.needs_draw_id ? 3 : 0) + 9) * 4; } } else { - /* userdata writes + instance count + non-indexed draw */ - *cmd_size += (5 + 2 + 3) * 4; + /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */ + *cmd_size += (4 + 10) * 4; + } + } else { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) { + /* userdata writes + instance count + indexed draw */ + *cmd_size += (5 + 2 + 5) * 4; + } else { + /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */ + *cmd_size += (4 + (vs->info.vs.needs_draw_id ? 
10 : 5)) * 4; + } + } else { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) { + const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK); + + if (task_shader) { + const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE); + const struct radv_userdata_info *draw_id_loc = + radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID); + + /* PKT3_DISPATCH_TASKMESH_GFX */ + *cmd_size += 4 * 4; + + if (xyz_loc->sgpr_idx != -1) + *ace_cmd_size += 5 * 4; + if (draw_id_loc->sgpr_idx != -1) + *ace_cmd_size += 3 * 4; + + /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */ + *ace_cmd_size += 6 * 4; + } else { + /* userdata writes + instance count + non-indexed draw */ + *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4; + } + } else { + /* userdata writes + instance count + non-indexed draw */ + *cmd_size += (5 + 2 + 3) * 4; + } } } @@ -176,24 +284,106 @@ radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layou } static void -radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline, - uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size) +radv_get_sequence_size_rt(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size, + uint32_t *upload_size) { - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); + + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); + const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline); + const struct radv_shader *rt_prolog = rt_pipeline->prolog; + + /* dispatch */ + *cmd_size += 5 * 4; + + const struct radv_userdata_info *cs_grid_size_loc = radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_GRID_SIZE); + if (cs_grid_size_loc->sgpr_idx != -1) { + if (device->load_grid_size_from_user_sgpr) { + /* PKT3_LOAD_SH_REG_INDEX */ + *cmd_size += 5 * 4; + } else { + /* PKT3_SET_SH_REG for pointer */ + *cmd_size += 4 * 4; + } + } + + const struct radv_userdata_info *cs_sbt_descriptors_loc = + radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS); + if (cs_sbt_descriptors_loc->sgpr_idx != -1) { + /* PKT3_SET_SH_REG for pointer */ + *cmd_size += 4 * 4; + } + + const struct radv_userdata_info *cs_ray_launch_size_addr_loc = + radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR); + if (cs_ray_launch_size_addr_loc->sgpr_idx != -1) { + /* PKT3_SET_SH_REG for pointer */ + *cmd_size += 4 * 4; + } + + if (device->sqtt.bo) { + /* sqtt markers */ + *cmd_size += 5 * 3 * 4; + } +} + +static void +radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size, + uint32_t *ace_cmd_size, uint32_t *upload_size) +{ + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); *cmd_size = 0; *ace_cmd_size = 0; *upload_size = 0; - if (layout->push_constant_mask) { + if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | 
BITFIELD_BIT(MESA_VK_DGC_SI))) { + VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout); bool need_copy = false; - if (pipeline) { - for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) { - if (!pipeline->shaders[i]) + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) { + /* Assume the compute shader needs both user SGPRs because we can't know the information + * for indirect pipelines. + */ + *cmd_size += 3 * 4; + need_copy = true; + + *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4; + } else { + struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0}; + if (pipeline_info) { + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) { + const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline); + struct radv_shader *rt_prolog = rt_pipeline->prolog; + + shaders[MESA_SHADER_COMPUTE] = rt_prolog; + } else { + memcpy(shaders, pipeline->shaders, sizeof(shaders)); + } + } else if (eso_info) { + for (unsigned i = 0; i < eso_info->shaderCount; ++i) { + VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]); + struct radv_shader *shader = shader_object->shader; + gl_shader_stage stage = shader->info.stage; + + shaders[stage] = shader; + } + } + + for (unsigned i = 0; i < ARRAY_SIZE(shaders); ++i) { + const struct radv_shader *shader = shaders[i]; + + if (!shader) continue; - struct radv_userdata_locations *locs = &pipeline->shaders[i]->info.user_sgprs_locs; + const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs; if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) { /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */ if (i == MESA_SHADER_TASK) { @@ -214,19 +404,10 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct } } } - } else { - /* Assume the compute shader needs both user SGPRs because we can't know the information - * for indirect pipelines. - */ - assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE); - *cmd_size += 3 * 4; - need_copy = true; - - *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4; } if (need_copy) { - *upload_size += align(layout->push_constant_size, 16); + *upload_size += align(pipeline_layout->push_constant_size, 16); } } @@ -235,54 +416,15 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct *cmd_size += 2 * 4; } - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { - struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); - radv_get_sequence_size_graphics(layout, graphics_pipeline, cmd_size, ace_cmd_size, upload_size); + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { + radv_get_sequence_size_compute(layout, pNext, cmd_size, upload_size); + } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) { + radv_get_sequence_size_rt(layout, pNext, cmd_size, upload_size); } else { - assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE); - struct radv_compute_pipeline *compute_pipeline = pipeline ? 
radv_pipeline_to_compute(pipeline) : NULL; - radv_get_sequence_size_compute(layout, compute_pipeline, cmd_size, upload_size); + radv_get_sequence_size_graphics(layout, pNext, cmd_size, ace_cmd_size, upload_size); } } -static uint32_t -radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type) -{ - const struct radv_physical_device *pdev = radv_device_physical(device); - const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4; - - return align(size, ib_alignment); -} - -static uint32_t -radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type) -{ - const struct radv_physical_device *pdev = radv_device_physical(device); - const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment; - - return align(size, ib_alignment); -} - -static unsigned -radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type) -{ - return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type); -} - -static unsigned -radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type) -{ - return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type); -} - -static bool -radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info) -{ - /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously - * a bit of a guess as it depends on the actual count which we don't know. */ - return cmd_info->sequencesCountBuffer != VK_NULL_HANDLE && cmd_info->sequencesCount >= 64; -} - struct dgc_cmdbuf_layout { bool use_preamble; uint32_t alloc_size; @@ -308,15 +450,13 @@ struct dgc_cmdbuf_layout { static void get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indirect_command_layout *dgc_layout, - struct radv_pipeline *pipeline, uint32_t sequences_count, bool use_preamble, - struct dgc_cmdbuf_layout *layout) + const void *pNext, uint32_t sequences_count, bool use_preamble, struct dgc_cmdbuf_layout *layout) { uint32_t offset = 0; memset(layout, 0, sizeof(*layout)); - radv_get_sequence_size(dgc_layout, pipeline, &layout->main_cmd_stride, &layout->ace_cmd_stride, - &layout->upload_stride); + radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride, &layout->upload_stride); layout->use_preamble = use_preamble; if (layout->use_preamble) { @@ -352,6 +492,7 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE); offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE); + layout->ace_preamble_offset = offset; if (layout->use_preamble) @@ -370,16 +511,15 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire } static uint32_t -radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type) +radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type) { - VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline); - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); - const bool use_preamble = radv_dgc_use_preamble(cmd_info); - const uint32_t sequences_count = cmd_info->sequencesCount; + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); + const struct radv_device *device = container_of(layout->vk.base.device, 
struct radv_device, vk); + const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo); + const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount; struct dgc_cmdbuf_layout cmdbuf_layout; - get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout); + get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout); if (use_preamble) return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_size : cmdbuf_layout.ace_preamble_size; @@ -388,31 +528,29 @@ radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum am } static uint32_t -radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type) +radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type) { - VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline); - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); - const bool use_preamble = radv_dgc_use_preamble(cmd_info); - const uint32_t sequences_count = cmd_info->sequencesCount; + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); + const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo); + const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount; struct dgc_cmdbuf_layout cmdbuf_layout; - get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout); + get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout); return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset; } static uint32_t -radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type) +radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type) { - VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline); - const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk); - const bool use_preamble = radv_dgc_use_preamble(cmd_info); - const uint32_t sequences_count = cmd_info->sequencesCount; + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); + const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk); + const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo); + const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount; struct dgc_cmdbuf_layout cmdbuf_layout; - get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout); + get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout); const uint32_t offset = ip_type == AMD_IP_GFX ? 
cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset; @@ -420,39 +558,39 @@ radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum } uint32_t -radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_GFX); + return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_GFX); } uint32_t -radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX); + return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_GFX); } uint32_t -radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_GFX); + return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_GFX); } uint32_t -radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_COMPUTE); + return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_COMPUTE); } uint32_t -radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_COMPUTE); + return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE); } uint32_t -radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info) +radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_COMPUTE); + return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE); } struct radv_dgc_params { @@ -472,9 +610,13 @@ struct radv_dgc_params { uint64_t sequence_count_addr; uint64_t stream_addr; + uint8_t queue_family; + uint8_t use_preamble; + /* draw info */ uint16_t vtx_base_sgpr; uint32_t max_index_count; + uint32_t max_draw_count; /* task/mesh info */ uint8_t has_task_shader; @@ -483,26 +625,34 @@ struct radv_dgc_params { uint16_t task_ring_entry_sgpr; uint16_t task_xyz_sgpr; uint16_t task_draw_id_sgpr; - uint8_t wave32; - uint8_t const_copy; + /* dispatch info */ + uint16_t grid_base_sgpr; + uint32_t wave32; - uint16_t vbo_reg; + /* RT info */ + uint16_t cs_sbt_descriptors; + uint16_t cs_ray_launch_size_addr; + + /* VBO info */ uint32_t vb_desc_usage_mask; + uint16_t vbo_reg; uint8_t dynamic_vs_input; uint8_t use_per_attribute_vb_descs; + /* push constants info */ + uint8_t const_copy; uint16_t push_constant_stages; - uint8_t use_preamble; + /* IES info */ + uint64_t ies_addr; + uint32_t ies_stride; + uint32_t indirect_desc_sets_va; /* For conditional rendering on ACE. 
*/ uint8_t predicating; uint8_t predication_type; uint64_t predication_va; - - /* For indirect descriptor sets */ - uint32_t indirect_desc_sets_va; }; enum { @@ -520,7 +670,7 @@ struct dgc_cmdbuf { nir_variable *offset; nir_variable *upload_offset; - nir_def *pipeline_va; /* For compute pipelines */ + nir_def *ies_va; }; static void @@ -566,15 +716,19 @@ dgc_upload(struct dgc_cmdbuf *cs, nir_def *data) nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \ .base = offsetof(struct radv_dgc_params, field), .range = 8)) -/* Pipeline metadata */ static nir_def * -dgc_get_pipeline_va(struct dgc_cmdbuf *cs, nir_def *stream_addr) +dgc_load_ies_va(struct dgc_cmdbuf *cs, nir_def *stream_addr) { const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - return nir_build_load_global(b, 1, 64, nir_iadd_imm(b, stream_addr, layout->pipeline_params_offset), - .access = ACCESS_NON_WRITEABLE); + nir_def *offset = nir_imm_int(b, layout->vk.ies_src_offset_B); + nir_def *ies_index = + nir_build_load_global(b, 1, 32, nir_iadd(b, stream_addr, nir_u2u64(b, offset)), .access = ACCESS_NON_WRITEABLE); + nir_def *ies_stride = load_param32(b, ies_stride); + nir_def *ies_offset = nir_imul(b, ies_index, ies_stride); + + return nir_iadd(b, load_param64(b, ies_addr), nir_u2u64(b, ies_offset)); } static nir_def * @@ -583,8 +737,8 @@ dgc_load_shader_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, uint32_t field const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - if (layout->bind_pipeline) { - return nir_load_global(b, nir_iadd_imm(b, cs->pipeline_va, field_offset), 4, 1, bitsize); + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) { + return nir_load_global(b, nir_iadd_imm(b, cs->ies_va, field_offset), 4, 1, bitsize); } else { nir_def *params_buf = radv_meta_load_descriptor(b, 0, 0); @@ -646,61 +800,9 @@ nir_pkt3(nir_builder *b, unsigned op, nir_def *len) return nir_pkt3_base(b, op, len, false); } -static void -dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid) -{ - nir_builder *b = cs->b; - - nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); - vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); - - nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); - nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); - - nir_def *pkt_cnt = nir_imm_int(b, 1); - pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); - pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); - - dgc_cs_begin(cs); - dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt)); - dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF)); - dgc_cs_emit(first_vertex); - dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance), - nir_imm_int(b, PKT3_NOP_PAD))); - dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD))); - dgc_cs_end(); -} - -static void -dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid) -{ - nir_builder *b = cs->b; - - nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); - vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); - - nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE); - nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); - - nir_push_if(b, nir_ior(b, has_grid_size, has_drawid)); - { - 
nir_def *pkt_cnt = nir_imm_int(b, 0); - pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt); - pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); - - dgc_cs_begin(cs); - dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt)); - dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF)); - /* DrawID needs to be first if no GridSize. */ - dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid)); - dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD))); - dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD))); - dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD))); - dgc_cs_end(); - } - nir_pop_if(b, NULL); -} - +/** + * SQTT + */ static void dgc_emit_sqtt_userdata(struct dgc_cmdbuf *cs, nir_def *data) { @@ -788,184 +890,9 @@ dgc_emit_sqtt_end_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01)); } -static void -dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0)); - dgc_cs_emit(instance_count); - dgc_cs_end(); -} - -static void -dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count, - nir_def *max_index_count) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0)); - dgc_cs_emit(max_index_count); - dgc_cs_emit(index_offset); - dgc_cs_emit(index_count); - dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA); - dgc_cs_end(); -} - -static void -dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0)); - dgc_cs_emit(vertex_count); - dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX); - dgc_cs_end(); -} - -static void -dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z, - nir_def *dispatch_initiator) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); - dgc_cs_emit(wg_x); - dgc_cs_emit(wg_y); - dgc_cs_emit(wg_z); - dgc_cs_emit(dispatch_initiator); - dgc_cs_end(); -} - -static void -dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0)); - dgc_cs_emit(x); - dgc_cs_emit(y); - dgc_cs_emit(z); - dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX)); - dgc_cs_end(); -} - -static void -dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y, - nir_def *wg_z) -{ - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0)); - dgc_cs_emit(grid_base_sgpr); - dgc_cs_emit(wg_x); - dgc_cs_emit(wg_y); - dgc_cs_emit(wg_z); - dgc_cs_end(); -} - -static void -dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *stream_addr) -{ - const struct radv_indirect_command_layout *layout = cs->layout; - nir_builder *b = cs->b; - - nir_def *va = nir_iadd_imm(b, stream_addr, layout->dispatch_params_offset); - - nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va); - nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va); - - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0)); - dgc_cs_emit(grid_base_sgpr); - dgc_cs_emit(va_lo); - dgc_cs_emit(va_hi); - dgc_cs_end(); -} - -static void -dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va) -{ - nir_builder *b = cs->b; - - nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va); - 
nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va); - - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0)); - dgc_cs_emit_imm(1); - dgc_cs_emit(va_lo); - dgc_cs_emit(va_hi); - dgc_cs_end(); -} - -static void -dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed) -{ - const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - nir_builder *b = cs->b; - - nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); - - nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); - nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); - - vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF); - - /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */ - nir_def *vertex_offset_reg = vtx_base_sgpr; - - /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */ - nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1)); - nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset); - - /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */ - nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1)); - - nir_if *if_drawid = nir_push_if(b, has_drawid); - { - const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI; - - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0)); - dgc_cs_emit_imm(0); - dgc_cs_emit(vertex_offset_reg); - dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0))); - dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1)))); - dgc_cs_emit_imm(1); /* draw count */ - dgc_cs_emit_imm(0); /* count va low */ - dgc_cs_emit_imm(0); /* count va high */ - dgc_cs_emit_imm(0); /* stride */ - dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX); - dgc_cs_end(); - } - nir_push_else(b, if_drawid); - { - const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT; - - dgc_cs_begin(cs); - dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0)); - dgc_cs_emit_imm(0); - dgc_cs_emit(vertex_offset_reg); - dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0))); - dgc_cs_emit_imm(di_src_sel); - dgc_cs_end(); - } - nir_pop_if(b, if_drawid); -} - -static void -dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed) -{ - const struct radv_indirect_command_layout *layout = cs->layout; - nir_builder *b = cs->b; - - nir_def *va = nir_iadd_imm(b, stream_addr, layout->draw_params_offset); - - dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect); - dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect); - - dgc_emit_pkt3_set_base(cs, va); - dgc_emit_pkt3_draw_indirect(cs, indexed); - - dgc_emit_sqtt_thread_trace_marker(cs); - dgc_emit_sqtt_end_api_marker(cs, indexed ? 
ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect); -} - +/** + * Command buffer + */ static nir_def * dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const struct radv_device *device) { @@ -987,10 +914,11 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str static void build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride, - nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size, + nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size, bool is_ace, const struct radv_device *device) { const struct radv_physical_device *pdev = radv_device_physical(device); + nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace); nir_def *global_id = get_global_ids(b, 1); @@ -1001,8 +929,11 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_ nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset"); nir_store_var(b, offset, cmd_buf_tail_start, 0x1); - /* Add NOPs padding but leave space for the INDIRECT_BUFFER packet. */ - cmd_buf_size = nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES); + /* On compute queue, the DGC command buffer is chained by patching the + * trailer but this isn't needed on graphics because it's using IB2. + */ + cmd_buf_size = + nir_bcsel(b, is_compute_queue, nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_size); nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi)); nir_push_loop(b); @@ -1028,16 +959,20 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_ } nir_pop_loop(b, NULL); - nir_def *chain_packet[] = { - nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)), - nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset), - nir_imm_int(b, pdev->info.address32_hi), - nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)), - }; + nir_push_if(b, is_compute_queue); + { + nir_def *chain_packets[] = { + nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)), + nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset), + nir_imm_int(b, pdev->info.address32_hi), + nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)), + }; - nir_build_store_global(b, nir_vec(b, chain_packet, 4), - nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))), - .access = ACCESS_NON_READABLE); + nir_build_store_global(b, nir_vec(b, chain_packets, 4), + nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))), + .access = ACCESS_NON_READABLE); + } + nir_pop_if(b, NULL); } nir_pop_if(b, NULL); } @@ -1052,7 +987,7 @@ build_dgc_buffer_tail_main(nir_builder *b, nir_def *sequence_count, const struct unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4; build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count, - trailer_size, device); + trailer_size, false, device); } static void @@ -1065,7 +1000,7 @@ build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4; build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count, - trailer_size, device); + trailer_size, true, 
device); } static void @@ -1183,15 +1118,161 @@ build_dgc_buffer_preamble_ace(nir_builder *b, nir_def *sequence_count, const str } /** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV. + * Draw */ +static void +dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid) +{ + nir_builder *b = cs->b; + + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); + + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); + + nir_def *pkt_cnt = nir_imm_int(b, 1); + pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); + pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); + + dgc_cs_begin(cs); + dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt)); + dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF)); + dgc_cs_emit(first_vertex); + dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance), + nir_imm_int(b, PKT3_NOP_PAD))); + dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD))); + dgc_cs_end(); +} + +static void +dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count) +{ + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0)); + dgc_cs_emit(instance_count); + dgc_cs_end(); +} + +static void +dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count, + nir_def *max_index_count) +{ + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0)); + dgc_cs_emit(max_index_count); + dgc_cs_emit(index_offset); + dgc_cs_emit(index_count); + dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA); + dgc_cs_end(); +} + +static void +dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count) +{ + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0)); + dgc_cs_emit(vertex_count); + dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX); + dgc_cs_end(); +} + +static void +dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va) +{ + nir_builder *b = cs->b; + + nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va); + nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va); + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0)); + dgc_cs_emit_imm(1); + dgc_cs_emit(va_lo); + dgc_cs_emit(va_hi); + dgc_cs_end(); +} + +static void +dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed) +{ + const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + nir_builder *b = cs->b; + + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); + + vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF); + + /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */ + nir_def *vertex_offset_reg = vtx_base_sgpr; + + /* start_instance_reg = (base_reg + (draw_id_enable ? 
8 : 4) - SI_SH_REG_OFFSET) >> 2 */ + nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1)); + nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset); + + /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */ + nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1)); + + nir_if *if_drawid = nir_push_if(b, has_drawid); + { + const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI; + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0)); + dgc_cs_emit_imm(0); + dgc_cs_emit(vertex_offset_reg); + dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0))); + dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1)))); + dgc_cs_emit_imm(1); /* draw count */ + dgc_cs_emit_imm(0); /* count va low */ + dgc_cs_emit_imm(0); /* count va high */ + dgc_cs_emit_imm(0); /* stride */ + dgc_cs_emit_imm(di_src_sel); + dgc_cs_end(); + } + nir_push_else(b, if_drawid); + { + const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT; + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0)); + dgc_cs_emit_imm(0); + dgc_cs_emit(vertex_offset_reg); + dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0))); + dgc_cs_emit_imm(di_src_sel); + dgc_cs_end(); + } + nir_pop_if(b, if_drawid); +} + +static void +dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed) +{ + const struct radv_indirect_command_layout *layout = cs->layout; + nir_builder *b = cs->b; + + nir_def *va = nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B); + + dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect); + dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect); + + dgc_emit_pkt3_set_base(cs, va); + dgc_emit_pkt3_draw_indirect(cs, indexed); + + dgc_emit_sqtt_thread_trace_marker(cs); + dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect); +} + static void dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) { const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), + nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), .access = ACCESS_NON_WRITEABLE); nir_def *vertex_count = nir_channel(b, draw_data0, 0); nir_def *instance_count = nir_channel(b, draw_data0, 1); @@ -1203,7 +1284,7 @@ dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDraw); dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDraw); - dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, sequence_id); + dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0)); dgc_emit_instance_count(cs, instance_count); dgc_emit_draw_index_auto(cs, vertex_count); @@ -1213,19 +1294,16 @@ dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) nir_pop_if(b, 0); } -/** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV. 
- */ static void dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, nir_def *max_index_count) { const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), + nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), .access = ACCESS_NON_WRITEABLE); nir_def *draw_data1 = - nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), 16), + nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), 16), .access = ACCESS_NON_WRITEABLE); nir_def *index_count = nir_channel(b, draw_data0, 0); nir_def *instance_count = nir_channel(b, draw_data0, 1); @@ -1238,7 +1316,7 @@ dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequ dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawIndexed); dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawIndexed); - dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, sequence_id); + dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0)); dgc_emit_instance_count(cs, instance_count); dgc_emit_draw_index_offset_2(cs, first_index, index_count, max_index_count); @@ -1248,8 +1326,56 @@ dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequ nir_pop_if(b, 0); } +static void +dgc_emit_draw_with_count(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed) +{ + const struct radv_indirect_command_layout *layout = cs->layout; + nir_builder *b = cs->b; + + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); + + nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), + .access = ACCESS_NON_WRITEABLE); + nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3)); + nir_def *stride = nir_channel(b, draw_data, 2); + nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3)); + + dgc_emit_pkt3_set_base(cs, va); + + nir_def *vertex_offset_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF); + nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1)); + nir_def *start_instance_reg = + nir_bcsel(b, has_baseinstance, nir_iadd(b, vertex_offset_reg, start_instance_offset), nir_imm_int(b, 0)); + nir_def *draw_id_reg = nir_bcsel( + b, has_drawid, nir_ior_imm(b, nir_iadd(b, vertex_offset_reg, nir_imm_int(b, 1)), S_2C3_DRAW_INDEX_ENABLE(1)), + nir_imm_int(b, 0)); + + nir_def *di_src_sel = nir_imm_int(b, indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX); + + dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount); + dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirectCount : EventCmdDrawIndirectCount); + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(indexed ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, false)); + dgc_cs_emit_imm(0); + dgc_cs_emit(vertex_offset_reg); + dgc_cs_emit(start_instance_reg); + dgc_cs_emit(draw_id_reg); + dgc_cs_emit(draw_count); + dgc_cs_emit_imm(0); + dgc_cs_emit_imm(0); + dgc_cs_emit(stride); + dgc_cs_emit(di_src_sel); + dgc_cs_end(); + + dgc_emit_sqtt_thread_trace_marker(cs); + dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount); +} + /** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV. + * Index buffer */ static nir_def * dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type) @@ -1257,10 +1383,17 @@ dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type) const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, layout->ibo_type_32), - nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16)); - return nir_bcsel(b, nir_ieq_imm(b, user_index_type, layout->ibo_type_8), nir_imm_int(b, V_028A7C_VGT_INDEX_8), - index_type); + if (layout->vk.index_mode_is_dx) { + nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x2a /* DXGI_FORMAT_R32_UINT */), + nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16)); + return nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x3e /* DXGI_FORMAT_R8_UINT */), + nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type); + } else { + nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT32), + nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16)); + return nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT8_KHR), + nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type); + } } static void @@ -1271,7 +1404,7 @@ dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable const struct radv_physical_device *pdev = radv_device_physical(device); nir_builder *b = cs->b; - nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->index_buffer_offset), + nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.index_src_offset_B), .access = ACCESS_NON_WRITEABLE); nir_def *index_type = dgc_get_index_type(cs, nir_channel(b, data, 3)); @@ -1309,15 +1442,15 @@ dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable } /** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV. 
+ * Push constants */ static nir_def * -dgc_get_push_constant_stages(struct dgc_cmdbuf *cs, nir_def *stream_addr) +dgc_get_push_constant_stages(struct dgc_cmdbuf *cs) { const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { nir_def *has_push_constant = nir_ine_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0); return nir_bcsel(b, has_push_constant, nir_imm_int(b, VK_SHADER_STAGE_COMPUTE_BIT), nir_imm_int(b, 0)); } else { @@ -1332,7 +1465,7 @@ dgc_get_upload_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of nir_builder *b = cs->b; nir_def *res; - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { res = load_shader_metadata32(cs, push_const_sgpr); } else { res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12)); @@ -1348,7 +1481,7 @@ dgc_get_inline_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of nir_builder *b = cs->b; nir_def *res; - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { res = load_shader_metadata32(cs, push_const_sgpr); } else { res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12)); @@ -1363,7 +1496,7 @@ dgc_get_inline_mask(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { return load_shader_metadata64(cs, inline_push_const_mask); } else { nir_def *reg_info = nir_load_ssbo(b, 2, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12 + 4)); @@ -1377,7 +1510,7 @@ dgc_push_constant_needs_copy(struct dgc_cmdbuf *cs) const struct radv_indirect_command_layout *layout = cs->layout; nir_builder *b = cs->b; - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { return nir_ine_imm(b, nir_ubfe_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0, 16), 0); } else { return nir_ine_imm(b, load_param8(b, const_copy), 0); @@ -1400,13 +1533,12 @@ dgc_get_pc_params(struct dgc_cmdbuf *cs) params.buf = radv_meta_load_descriptor(b, 0, 0); uint32_t offset = 0; - - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { - offset = layout->bind_pipeline ? 0 : sizeof(struct radv_compute_pipeline_metadata); + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { + offset = + (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) ? 
0 : sizeof(struct radv_compute_pipeline_metadata); } else { - if (layout->bind_vbo_mask) { - offset += MAX_VBS * DGC_VBO_INFO_SIZE; - } + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) + offset = MAX_VBS * DGC_VBO_INFO_SIZE; } params.offset = nir_imm_int(b, offset); @@ -1416,15 +1548,19 @@ dgc_get_pc_params(struct dgc_cmdbuf *cs) } static void -dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struct dgc_pc_params *params) +dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, + const struct dgc_pc_params *params) { const struct radv_indirect_command_layout *layout = cs->layout; + VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout); nir_builder *b = cs->b; - for (uint32_t i = 0; i < layout->push_constant_size / 4; i++) { + for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) { nir_def *data; - if ((layout->push_constant_mask & (1ull << i))) { + if (layout->sequence_index_mask & (1ull << i)) { + data = sequence_id; + } else if ((layout->push_constant_mask & (1ull << i))) { data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]), .access = ACCESS_NON_WRITEABLE); } else { @@ -1436,10 +1572,11 @@ dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struc } static void -dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struct dgc_pc_params *params, - gl_shader_stage stage) +dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, + const struct dgc_pc_params *params, gl_shader_stage stage) { const struct radv_indirect_command_layout *layout = cs->layout; + VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout); nir_builder *b = cs->b; nir_def *upload_sgpr = dgc_get_upload_sgpr(cs, params->buf, params->offset, stage); @@ -1461,15 +1598,17 @@ dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, co nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx"); nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1); - for (uint32_t i = 0; i < layout->push_constant_size / 4; i++) { + for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) { nir_push_if(b, nir_ine_imm(b, nir_iand_imm(b, inline_mask, 1ull << i), 0)); { nir_def *data = NULL; - if (layout->push_constant_mask & (1ull << i)) { + if (layout->sequence_index_mask & (1ull << i)) { + data = sequence_id; + } else if (layout->push_constant_mask & (1ull << i)) { data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]), .access = ACCESS_NON_WRITEABLE); - } else if (layout->bind_pipeline) { + } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) { /* For indirect pipeline binds, partial push constant updates can't be emitted when * the DGC execute is called because there is no bound pipeline and they have to be * emitted from the DGC prepare shader. 
@@ -1494,17 +1633,17 @@ dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, co } static void -dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, VkShaderStageFlags stages) +dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, VkShaderStageFlags stages) { const struct dgc_pc_params params = dgc_get_pc_params(cs); nir_builder *b = cs->b; - nir_def *push_constant_stages = dgc_get_push_constant_stages(cs, stream_addr); + nir_def *push_constant_stages = dgc_get_push_constant_stages(cs); radv_foreach_stage(s, stages) { nir_push_if(b, nir_test_mask(b, push_constant_stages, mesa_to_vk_shader_stage(s))); { - dgc_emit_push_constant_for_stage(cs, stream_addr, ¶ms, s); + dgc_emit_push_constant_for_stage(cs, stream_addr, sequence_id, ¶ms, s); } nir_pop_if(b, NULL); } @@ -1512,13 +1651,13 @@ dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, VkShaderStag nir_def *const_copy = dgc_push_constant_needs_copy(cs); nir_push_if(b, const_copy); { - dgc_alloc_push_constant(cs, stream_addr, ¶ms); + dgc_alloc_push_constant(cs, stream_addr, sequence_id, ¶ms); } nir_pop_if(b, NULL); } /** - * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV. + * Vertex buffers */ struct dgc_vbo_info { nir_def *va; @@ -1689,8 +1828,8 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr) nir_def *binding = load_vbo_metadata32(cs, cur_idx, binding); - nir_def *vbo_override = - nir_ine_imm(b, nir_iand(b, nir_imm_int(b, layout->bind_vbo_mask), nir_ishl(b, nir_imm_int(b, 1), binding)), 0); + nir_def *vbo_override = nir_ine_imm( + b, nir_iand(b, nir_imm_int(b, layout->vk.vertex_bindings), nir_ishl(b, nir_imm_int(b, 1), binding)), 0); nir_push_if(b, vbo_override); { nir_def *stream_offset = load_vbo_offset(cs, cur_idx); @@ -1700,12 +1839,7 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr) nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2)); nir_def *size = nir_channel(b, stream_data, 2); - nir_def *stride; - if (layout->vertex_dynamic_stride) { - stride = nir_channel(b, stream_data, 3); - } else { - stride = load_vbo_metadata32(cs, cur_idx, stride); - } + nir_def *stride = nir_channel(b, stream_data, 3); nir_store_var(b, va_var, va, 0x1); nir_store_var(b, size_var, size, 0x1); @@ -1750,7 +1884,7 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr) } /** - * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV. 
+ * Compute dispatch */ static nir_def * dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs) @@ -1765,36 +1899,66 @@ dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs) } static void -dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) +dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y, + nir_def *wg_z) +{ + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0)); + dgc_cs_emit(grid_base_sgpr); + dgc_cs_emit(wg_x); + dgc_cs_emit(wg_y); + dgc_cs_emit(wg_z); + dgc_cs_end(); +} + +static void +dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *size_va) +{ + nir_builder *b = cs->b; + + nir_def *va_lo = nir_unpack_64_2x32_split_x(b, size_va); + nir_def *va_hi = nir_unpack_64_2x32_split_y(b, size_va); + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0)); + dgc_cs_emit(grid_base_sgpr); + dgc_cs_emit(va_lo); + dgc_cs_emit(va_hi); + dgc_cs_end(); +} + +static void +dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z, + nir_def *dispatch_initiator, nir_def *grid_sgpr, nir_def *size_va, nir_def *sequence_id, + bool is_rt) { - const struct radv_indirect_command_layout *layout = cs->layout; const struct radv_device *device = cs->dev; nir_builder *b = cs->b; - nir_def *dispatch_data = nir_build_load_global( - b, 3, 32, nir_iadd_imm(b, stream_addr, layout->dispatch_params_offset), .access = ACCESS_NON_WRITEABLE); - nir_def *wg_x = nir_channel(b, dispatch_data, 0); - nir_def *wg_y = nir_channel(b, dispatch_data, 1); - nir_def *wg_z = nir_channel(b, dispatch_data, 2); - nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0)))); { - nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr); nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0)); { if (device->load_grid_size_from_user_sgpr) { dgc_emit_grid_size_user_sgpr(cs, grid_sgpr, wg_x, wg_y, wg_z); } else { - dgc_emit_grid_size_pointer(cs, grid_sgpr, stream_addr); + dgc_emit_grid_size_pointer(cs, grid_sgpr, size_va); } } nir_pop_if(b, 0); dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDispatch); - dgc_emit_sqtt_marker_event_with_dims(cs, sequence_id, wg_x, wg_y, wg_z, EventCmdDispatch); + dgc_emit_sqtt_marker_event_with_dims( + cs, sequence_id, wg_x, wg_y, wg_z, + is_rt ? 
EventCmdTraceRaysKHR | ApiRayTracingSeparateCompiled : EventCmdDispatch); - nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs); - dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator); + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); + dgc_cs_emit(wg_x); + dgc_cs_emit(wg_y); + dgc_cs_emit(wg_z); + dgc_cs_emit(dispatch_initiator); + dgc_cs_end(); dgc_emit_sqtt_thread_trace_marker(cs); dgc_emit_sqtt_end_api_marker(cs, ApiCmdDispatch); @@ -1802,11 +1966,72 @@ dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence nir_pop_if(b, 0); } +static void +dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) +{ + const struct radv_indirect_command_layout *layout = cs->layout; + nir_builder *b = cs->b; + + nir_def *dispatch_data = nir_build_load_global( + b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B), .access = ACCESS_NON_WRITEABLE); + nir_def *wg_x = nir_channel(b, dispatch_data, 0); + nir_def *wg_y = nir_channel(b, dispatch_data, 1); + nir_def *wg_z = nir_channel(b, dispatch_data, 2); + + nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr); + nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs); + nir_def *size_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B); + + dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator, grid_sgpr, size_va, sequence_id, false); +} + /** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV. + * Draw mesh/task */ static void -dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs) +dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid) +{ + nir_builder *b = cs->b; + + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); + + nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE); + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + + nir_push_if(b, nir_ior(b, has_grid_size, has_drawid)); + { + nir_def *pkt_cnt = nir_imm_int(b, 0); + pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt); + pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); + + dgc_cs_begin(cs); + dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt)); + dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF)); + /* DrawID needs to be first if no GridSize. 
*/ + dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid)); + dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD))); + dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD))); + dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD))); + dgc_cs_end(); + } + nir_pop_if(b, NULL); +} + +static void +dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z) +{ + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0)); + dgc_cs_emit(x); + dgc_cs_emit(y); + dgc_cs_emit(z); + dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX)); + dgc_cs_end(); +} + +static void +dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs, nir_def *sequence_id) { const struct radv_device *device = cs->dev; const struct radv_physical_device *pdev = radv_device_physical(device); @@ -1826,6 +2051,9 @@ dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs) nir_bcsel(b, has_linear_dispatch_en, nir_imm_int(b, S_4D1_LINEAR_DISPATCH_ENABLE(1)), nir_imm_int(b, 0)); nir_def *sqtt_enable = nir_imm_int(b, device->sqtt.bo ? S_4D1_THREAD_TRACE_MARKER_ENABLE(1) : 0); + dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT); + dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT); + dgc_cs_begin(cs); dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, 0) | PKT3_RESET_FILTER_CAM_S(1)); /* S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg) */ @@ -1837,6 +2065,9 @@ dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs) } dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX); dgc_cs_end(); + + dgc_emit_sqtt_thread_trace_marker(cs); + dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT); } static void @@ -1847,7 +2078,7 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de const struct radv_physical_device *pdev = radv_device_physical(device); nir_builder *b = cs->b; - nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), + nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), .access = ACCESS_NON_WRITEABLE); nir_def *x = nir_channel(b, draw_data, 0); nir_def *y = nir_channel(b, draw_data, 1); @@ -1855,15 +2086,15 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0)))); { - dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT); - dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT); - nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1)); { - dgc_emit_dispatch_taskmesh_gfx(cs); + dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id); } nir_push_else(b, NULL); { + dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT); + dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT); + dgc_emit_userdata_mesh(cs, x, y, z, sequence_id); dgc_emit_instance_count(cs, nir_imm_int(b, 1)); @@ -1882,6 +2113,76 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de nir_pop_if(b, NULL); } +static void +dgc_emit_draw_mesh_tasks_with_count_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) +{ + const struct radv_indirect_command_layout *layout = cs->layout; + const struct radv_device *device = cs->dev; + const struct radv_physical_device *pdev = radv_device_physical(device); + nir_builder *b = cs->b; + + nir_push_if(b, nir_ieq_imm(b, 
load_param8(b, has_task_shader), 1)); + { + dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id); + } + nir_push_else(b, NULL); + { + nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr); + nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE); + nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); + + nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), + .access = ACCESS_NON_WRITEABLE); + nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3)); + nir_def *stride = nir_channel(b, draw_data, 2); + nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3)); + + dgc_emit_pkt3_set_base(cs, va); + + nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF); + nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0)); + nir_def *draw_id_offset = nir_bcsel(b, has_grid_size, nir_imm_int(b, 3), nir_imm_int(b, 0)); + nir_def *draw_id_reg = nir_bcsel(b, has_drawid, nir_iadd(b, base_reg, draw_id_offset), nir_imm_int(b, 0)); + + nir_push_if(b, has_drawid); + { + nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), draw_id_reg, nir_imm_int(b, 0)}; + dgc_emit(cs, 3, packet); + } + nir_pop_if(b, NULL); + + nir_def *draw_index_enable = + nir_bcsel(b, has_drawid, nir_imm_int(b, S_4C2_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0)); + nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4C2_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0)); + + dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT); + dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksIndirectCountEXT); + + dgc_cs_begin(cs); + dgc_cs_emit(nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, false) | PKT3_RESET_FILTER_CAM_S(1))); + dgc_cs_emit_imm(0); /* data offset */ + /* S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg) */ + dgc_cs_emit( + nir_ior(b, nir_iand_imm(b, xyz_dim_reg, 0xFFFF), nir_ishl_imm(b, nir_iand_imm(b, draw_id_reg, 0xFFFF), 16))); + if (pdev->info.gfx_level >= GFX11) { + dgc_cs_emit(nir_ior_imm(b, nir_ior(b, draw_index_enable, xyz_dim_enable), + S_4C2_MODE1_ENABLE(!pdev->mesh_fast_launch_2))); + } else { + dgc_cs_emit(draw_index_enable); + } + dgc_cs_emit(draw_count); + dgc_cs_emit_imm(0); + dgc_cs_emit_imm(0); + dgc_cs_emit(stride); + dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX); + dgc_cs_end(); + + dgc_emit_sqtt_thread_trace_marker(cs); + dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT); + } + nir_pop_if(b, NULL); +} + static void dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z) { @@ -1912,18 +2213,24 @@ dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_de nir_pop_if(b, NULL); } +static nir_def * +dgc_get_dispatch_initiator_task(struct dgc_cmdbuf *ace_cs) +{ + const struct radv_device *device = ace_cs->dev; + const uint32_t dispatch_initiator_task = device->dispatch_initiator_task; + nir_builder *b = ace_cs->b; + + nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1); + return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)), + nir_imm_int(b, dispatch_initiator_task)); +} + static void dgc_emit_dispatch_taskmesh_direct_ace(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z) { - const struct radv_device *device = ace_cs->dev; + nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs); nir_builder *b = ace_cs->b; - const 
uint32_t dispatch_initiator_task = device->dispatch_initiator_task; - nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1); - nir_def *dispatch_initiator = - nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)), - nir_imm_int(b, dispatch_initiator_task)); - dgc_cs_begin(ace_cs); dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, 0) | PKT3_SHADER_TYPE_S(1)); dgc_cs_emit(x); @@ -1940,7 +2247,7 @@ dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr) const struct radv_indirect_command_layout *layout = ace_cs->layout; nir_builder *b = ace_cs->b; - nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), + nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), .access = ACCESS_NON_WRITEABLE); nir_def *x = nir_channel(b, draw_data, 0); nir_def *y = nir_channel(b, draw_data, 1); @@ -1954,8 +2261,47 @@ dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr) nir_pop_if(b, NULL); } +static void +dgc_emit_draw_mesh_tasks_with_count_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr, nir_def *sequence_id) +{ + const struct radv_indirect_command_layout *layout = ace_cs->layout; + nir_builder *b = ace_cs->b; + + nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), + .access = ACCESS_NON_WRITEABLE); + nir_def *va_lo = nir_channel(b, draw_data, 0); + nir_def *va_hi = nir_channel(b, draw_data, 1); + nir_def *stride = nir_channel(b, draw_data, 2); + nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3)); + + nir_def *xyz_dim_reg = load_param16(b, task_xyz_sgpr); + nir_def *ring_entry_reg = load_param16(b, task_ring_entry_sgpr); + nir_def *draw_id_reg = load_param16(b, task_draw_id_sgpr); + + nir_def *draw_index_enable = + nir_bcsel(b, nir_ine_imm(b, draw_id_reg, 0), nir_imm_int(b, S_AD3_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0)); + nir_def *xyz_dim_enable = + nir_bcsel(b, nir_ine_imm(b, xyz_dim_reg, 0), nir_imm_int(b, S_AD3_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0)); + + nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs); + + dgc_cs_begin(ace_cs); + dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1)); + dgc_cs_emit(va_lo); + dgc_cs_emit(va_hi); + dgc_cs_emit(ring_entry_reg); + dgc_cs_emit(nir_ior(b, draw_index_enable, nir_ior(b, xyz_dim_enable, nir_ishl_imm(b, draw_id_reg, 16)))); + dgc_cs_emit(xyz_dim_reg); + dgc_cs_emit(draw_count); + dgc_cs_emit_imm(0); + dgc_cs_emit_imm(0); + dgc_cs_emit(stride); + dgc_cs_emit(dispatch_initiator); + dgc_cs_end(); +} + /** - * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV. + * Indirect execution set */ static void dgc_emit_indirect_sets(struct dgc_cmdbuf *cs) @@ -1975,11 +2321,11 @@ dgc_emit_indirect_sets(struct dgc_cmdbuf *cs) } static void -dgc_emit_bind_pipeline(struct dgc_cmdbuf *cs) +dgc_emit_ies(struct dgc_cmdbuf *cs) { nir_builder *b = cs->b; - nir_def *va = nir_iadd_imm(b, cs->pipeline_va, sizeof(struct radv_compute_pipeline_metadata)); + nir_def *va = nir_iadd_imm(b, cs->ies_va, sizeof(struct radv_compute_pipeline_metadata)); nir_def *num_dw = nir_build_load_global(b, 1, 32, va, .access = ACCESS_NON_WRITEABLE); nir_def *cs_va = nir_iadd_imm(b, va, 4); @@ -2006,6 +2352,66 @@ dgc_emit_bind_pipeline(struct dgc_cmdbuf *cs) dgc_emit_indirect_sets(cs); } +/** + * Raytracing. 
+ */ +static void +dgc_emit_shader_pointer(struct dgc_cmdbuf *cs, nir_def *sh_offset, nir_def *va) +{ + nir_builder *b = cs->b; + + nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va); + nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va); + + dgc_cs_begin(cs); + dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0)); + dgc_cs_emit(sh_offset); + dgc_cs_emit(va_lo); + dgc_cs_emit(va_hi); + dgc_cs_end(); +} + +static void +dgc_emit_rt(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id) +{ + const struct radv_indirect_command_layout *layout = cs->layout; + const struct radv_device *device = cs->dev; + nir_builder *b = cs->b; + + nir_def *indirect_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B); + + nir_def *cs_sbt_descriptors = load_param16(b, cs_sbt_descriptors); + nir_push_if(b, nir_ine_imm(b, cs_sbt_descriptors, 0)); + { + dgc_emit_shader_pointer(cs, cs_sbt_descriptors, indirect_va); + } + nir_pop_if(b, NULL); + + nir_def *launch_size_va = nir_iadd_imm(b, indirect_va, offsetof(VkTraceRaysIndirectCommand2KHR, width)); + + nir_def *cs_ray_launch_size_addr = load_param16(b, cs_ray_launch_size_addr); + nir_push_if(b, nir_ine_imm(b, cs_ray_launch_size_addr, 0)); + { + dgc_emit_shader_pointer(cs, cs_ray_launch_size_addr, launch_size_va); + } + nir_pop_if(b, NULL); + + const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_USE_THREAD_DIMENSIONS(1); + nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1); + nir_def *dispatch_initiator_rt = nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)), + nir_imm_int(b, dispatch_initiator)); + + nir_def *dispatch_data = nir_build_load_global(b, 3, 32, launch_size_va, .access = ACCESS_NON_WRITEABLE); + nir_def *width = nir_channel(b, dispatch_data, 0); + nir_def *height = nir_channel(b, dispatch_data, 1); + nir_def *depth = nir_channel(b, dispatch_data, 2); + + nir_def *grid_sgpr = load_param16(b, grid_base_sgpr); + + dgc_emit_dispatch_direct(cs, width, height, depth, dispatch_initiator_rt, grid_sgpr, launch_size_va, sequence_id, + true); +} + static nir_def * dgc_is_cond_render_enabled(nir_builder *b) { @@ -2059,23 +2465,22 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l nir_def *sequence_id = global_id; nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride); - nir_def *sequence_count = load_param32(&b, sequence_count); - - nir_def *use_count = nir_iand_imm(&b, sequence_count, 1u << 31); - sequence_count = nir_iand_imm(&b, sequence_count, UINT32_MAX >> 1); - nir_def *cmd_buf_base_offset = load_param32(&b, cmd_buf_main_offset); + nir_def *sequence_count = load_param32(&b, sequence_count); + nir_def *sequence_count_addr = load_param64(&b, sequence_count_addr); + /* The effective number of draws is * min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when * using sequencesCountBuffer. Otherwise it is sequencesCount. */ nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count"); nir_store_var(&b, count_var, sequence_count, 0x1); - nir_push_if(&b, nir_ine_imm(&b, use_count, 0)); + nir_push_if(&b, nir_ine_imm(&b, sequence_count_addr, 0)); { nir_def *cnt = nir_build_load_global(&b, 1, 32, load_param64(&b, sequence_count_addr), .access = ACCESS_NON_WRITEABLE); + /* Must clamp count against the API count explicitly. * The workgroup potentially contains more threads than maxSequencesCount from API, * and we have to ensure these threads write NOP packets to pad out the IB. 
*/ @@ -2111,33 +2516,40 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride); nir_def *stream_addr = load_param64(&b, stream_addr); - stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->input_stride))); - - if (layout->bind_pipeline) - cmd_buf.pipeline_va = dgc_get_pipeline_va(&cmd_buf, stream_addr); + stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride))); nir_def *upload_offset_init = nir_iadd(&b, load_param32(&b, upload_main_offset), nir_imul(&b, load_param32(&b, upload_stride), sequence_id)); nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1); + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) + cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr); + if (layout->push_constant_mask) { const VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT | VK_SHADER_STAGE_MESH_BIT_EXT; - dgc_emit_push_constant(&cmd_buf, stream_addr, stages); + dgc_emit_push_constant(&cmd_buf, stream_addr, sequence_id, stages); } - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { - if (layout->bind_vbo_mask) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) { + /* Raytracing */ + dgc_emit_rt(&cmd_buf, stream_addr, sequence_id); + } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { + /* Compute */ + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) { + dgc_emit_ies(&cmd_buf); + } + + dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id); + } else { + /* Graphics */ + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { dgc_emit_vertex_buffer(&cmd_buf, stream_addr); } - if (layout->indexed) { - /* Emit direct draws when index buffers are also updated by DGC. Otherwise, emit - * indirect draws to remove the dependency on the cmdbuf state in order to enable - * preprocessing. 
- */ - if (layout->binds_index_buffer) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) { + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) { nir_variable *max_index_count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count"); @@ -2145,23 +2557,34 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l nir_def *max_index_count = nir_load_var(&b, max_index_count_var); - dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count); + if (layout->vk.draw_count) { + dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true); + } else { + dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count); + } } else { - dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true); + if (layout->vk.draw_count) { + dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true); + } else { + dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true); + } } } else { - if (layout->draw_mesh_tasks) { - dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id); + /* Non-indexed draws */ + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) { + if (layout->vk.draw_count) { + dgc_emit_draw_mesh_tasks_with_count_gfx(&cmd_buf, stream_addr, sequence_id); + } else { + dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id); + } } else { - dgc_emit_draw(&cmd_buf, stream_addr, sequence_id); + if (layout->vk.draw_count) { + dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, false); + } else { + dgc_emit_draw(&cmd_buf, stream_addr, sequence_id); + } } } - } else { - if (layout->bind_pipeline) { - dgc_emit_bind_pipeline(&cmd_buf); - } - - dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id); } /* Pad the cmdbuffer if we did not use the whole stride */ @@ -2195,27 +2618,31 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), ace_cmd_buf_stride); nir_def *stream_addr = load_param64(&b, stream_addr); - stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->input_stride))); - - if (layout->bind_pipeline) - cmd_buf.pipeline_va = dgc_get_pipeline_va(&cmd_buf, stream_addr); + stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride))); nir_def *upload_offset_init = nir_iadd(&b, load_param32(&b, upload_main_offset), nir_imul(&b, load_param32(&b, upload_stride), sequence_id)); nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1); + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) + cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr); + if (layout->push_constant_mask) { - nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf, stream_addr); + nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf); nir_push_if(&b, nir_test_mask(&b, push_constant_stages, VK_SHADER_STAGE_TASK_BIT_EXT)); { const struct dgc_pc_params params = dgc_get_pc_params(&cmd_buf); - dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, ¶ms, MESA_SHADER_TASK); + dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, sequence_id, ¶ms, MESA_SHADER_TASK); } nir_pop_if(&b, NULL); } - dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr); + if (layout->vk.draw_count) { + dgc_emit_draw_mesh_tasks_with_count_ace(&cmd_buf, stream_addr, sequence_id); + } else { + dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr); + } /* Pad the cmdbuffer if we did 
not use the whole stride */ dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end); @@ -2300,130 +2727,17 @@ radv_create_dgc_pipeline(struct radv_device *device, struct radv_indirect_comman return result; } -static void -radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator, - struct radv_indirect_command_layout *layout) -{ - radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc); - - vk_object_base_finish(&layout->base); - vk_free2(&device->vk.alloc, pAllocator, layout); -} - -VKAPI_ATTR VkResult VKAPI_CALL -radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkIndirectCommandsLayoutNV *pIndirectCommandsLayout) -{ - VK_FROM_HANDLE(radv_device, device, _device); - struct radv_indirect_command_layout *layout; - VkResult result; - - size_t size = sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV); - - layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout), - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!layout) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV); - - layout->flags = pCreateInfo->flags; - layout->pipeline_bind_point = pCreateInfo->pipelineBindPoint; - layout->input_stride = pCreateInfo->pStreamStrides[0]; - layout->token_count = pCreateInfo->tokenCount; - typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount); - - layout->ibo_type_32 = VK_INDEX_TYPE_UINT32; - layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_KHR; - - for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) { - switch (pCreateInfo->pTokens[i].tokenType) { - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: - layout->draw_params_offset = pCreateInfo->pTokens[i].offset; - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: - layout->indexed = true; - layout->draw_params_offset = pCreateInfo->pTokens[i].offset; - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV: - layout->dispatch_params_offset = pCreateInfo->pTokens[i].offset; - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV: - layout->binds_index_buffer = true; - layout->index_buffer_offset = pCreateInfo->pTokens[i].offset; - /* 16-bit is implied if we find no match. 
*/ - for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) { - if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32) - layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j]; - else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_KHR) - layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j]; - } - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV: - layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit; - layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] = pCreateInfo->pTokens[i].offset; - layout->vertex_dynamic_stride = pCreateInfo->pTokens[i].vertexDynamicStride; - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: { - VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->pTokens[i].pushconstantPipelineLayout); - for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0; - k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) { - layout->push_constant_mask |= 1ull << j; - layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4; - } - layout->push_constant_size = pipeline_layout->push_constant_size; - assert(!pipeline_layout->dynamic_offset_count); - break; - } - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: - layout->draw_mesh_tasks = true; - layout->draw_params_offset = pCreateInfo->pTokens[i].offset; - break; - case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV: - layout->bind_pipeline = true; - layout->pipeline_params_offset = pCreateInfo->pTokens[i].offset; - break; - default: - unreachable("Unhandled token type"); - } - } - - result = radv_create_dgc_pipeline(device, layout); - if (result != VK_SUCCESS) { - radv_destroy_indirect_commands_layout(device, pAllocator, layout); - return result; - } - - *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout); - return VK_SUCCESS; -} - VKAPI_ATTR void VKAPI_CALL -radv_DestroyIndirectCommandsLayoutNV(VkDevice _device, VkIndirectCommandsLayoutNV indirectCommandsLayout, - const VkAllocationCallbacks *pAllocator) -{ - VK_FROM_HANDLE(radv_device, device, _device); - VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout); - - if (!layout) - return; - - radv_destroy_indirect_commands_layout(device, pAllocator, layout); -} - -VKAPI_ATTR void VKAPI_CALL -radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device, - const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) +radv_GetGeneratedCommandsMemoryRequirementsEXT(VkDevice _device, + const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) { VK_FROM_HANDLE(radv_device, device, _device); const struct radv_physical_device *pdev = radv_device_physical(device); VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline); struct dgc_cmdbuf_layout cmdbuf_layout; - get_dgc_cmdbuf_layout(device, layout, pipeline, pInfo->maxSequencesCount, true, &cmdbuf_layout); + get_dgc_cmdbuf_layout(device, layout, pInfo->pNext, pInfo->maxSequenceCount, true, &cmdbuf_layout); pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit; pMemoryRequirements->memoryRequirements.alignment = @@ -2433,28 +2747,12 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device, } bool -radv_dgc_with_task_shader(const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) 
+radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo) { - VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); - - if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_GRAPHICS) - return false; - - if (!layout->draw_mesh_tasks) - return false; - - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); - const struct radv_shader *task_shader = radv_get_shader(pipeline->shaders, MESA_SHADER_TASK); - if (!task_shader) - return false; - - return true; -} - -bool -radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) -{ - VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's @@ -2462,137 +2760,42 @@ radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo * when the graphics pipelines has a task shader for the same reason (otherwise the DGC ACE IB * would be uninitialized). */ - return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_with_task_shader(pGeneratedCommandsInfo) && - seq_count_buffer && !cmd_buffer->state.predicating; + return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK) && + pGeneratedCommandsInfo->sequenceCountAddress != 0 && !cmd_buffer->state.predicating; } -static bool -radv_dgc_need_push_constants_copy(const struct radv_pipeline *pipeline) -{ - for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) { - const struct radv_shader *shader = pipeline->shaders[i]; - - if (!shader) - continue; - - const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs; - if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) - return true; - } - - return false; -} - -bool -radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline) -{ - if (!(layout->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV)) - return false; - - /* From the Vulkan spec (1.3.269, chapter 32): - * "The bound descriptor sets and push constants that will be used with indirect command generation for the compute - * piplines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV. - * They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV." - * - * So we can always preprocess compute layouts. - */ - if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE) { - /* VBO binding (in particular partial VBO binding) uses some draw state which we don't generate at preprocess time - * yet. */ - if (layout->bind_vbo_mask) - return false; - - /* Do not preprocess when all push constants can't be inlined because they need to be copied - * to the upload BO. 
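radv_use_dgc_predication() above (and most of the new prepare paths below) resolves shaders through radv_dgc_get_shader(), whose definition lies outside this excerpt: it looks a stage up either in the pipeline chained via VkGeneratedCommandsPipelineInfoEXT or in the shader objects chained via VkGeneratedCommandsShaderInfoEXT. A hedged sketch of that lookup, written only with fields that appear elsewhere in this patch; the real implementation may differ:

/* Sketch only: resolve the radv_shader for "stage" from whichever pNext
 * struct the application attached to VkGeneratedCommandsInfoEXT. */
struct radv_shader *
radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info,
                    const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage)
{
   if (pipeline_info) {
      VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
      return pipeline->shaders[stage];
   }

   if (eso_info) {
      for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
         VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
         if (shader_object->stage == stage)
            return shader_object->shader;
      }
   }

   return NULL;
}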
- */ - if (layout->push_constant_mask && radv_dgc_need_push_constants_copy(pipeline)) - return false; - } - - return true; -} - -/* Always need to call this directly before draw due to dependence on bound state. */ -static void -radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo, - unsigned *upload_size, unsigned *upload_offset, void **upload_data, - struct radv_dgc_params *params) +VKAPI_ATTR void VKAPI_CALL +radv_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer, + const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + VkCommandBuffer stateCommandBuffer) { + VK_FROM_HANDLE(radv_cmd_buffer, state_cmd_buffer, stateCommandBuffer); + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); - struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); - struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX); - unsigned vb_size = layout->bind_vbo_mask ? MAX_VBS * DGC_VBO_INFO_SIZE : 0; - *upload_size = MAX2(*upload_size + vb_size, 16); + assert(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT); - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) { - vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY); - return; - } + /* VK_EXT_conditional_rendering says that copy commands should not be + * affected by conditional rendering. + */ + const bool old_predicating = cmd_buffer->state.predicating; + cmd_buffer->state.predicating = false; - uint16_t vtx_base_sgpr = 0; + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, old_predicating); - if (graphics_pipeline->vtx_base_sgpr) - vtx_base_sgpr = (graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2; - - if (graphics_pipeline->uses_drawid) - vtx_base_sgpr |= DGC_USES_DRAWID; - - if (layout->draw_mesh_tasks) { - struct radv_shader *mesh_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_MESH); - const struct radv_shader *task_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_TASK); - - if (mesh_shader->info.cs.uses_grid_size) - vtx_base_sgpr |= DGC_USES_GRID_SIZE; - - if (task_shader) { - params->has_task_shader = 1; - params->mesh_ring_entry_sgpr = radv_get_user_sgpr(mesh_shader, AC_UD_TASK_RING_ENTRY); - params->wave32 = task_shader->info.wave_size == 32; - params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch; - params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY); - params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE); - params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID); - } - } else { - if (graphics_pipeline->uses_baseinstance) - vtx_base_sgpr |= DGC_USES_BASEINSTANCE; - } - - params->vtx_base_sgpr = vtx_base_sgpr; - params->max_index_count = cmd_buffer->state.max_index_count; - params->dynamic_vs_input = layout->bind_vbo_mask && vs->info.vs.dynamic_inputs; - params->use_per_attribute_vb_descs = layout->bind_vbo_mask && vs->info.vs.use_per_attribute_vb_descs; - - if (layout->bind_vbo_mask) { - uint8_t *ptr = (uint8_t *)((char *)*upload_data); - - for (uint32_t i = 0; i < MAX_VBS; i++) { - struct radv_vbo_info vbo_info; - radv_get_vbo_info(cmd_buffer, i, 
&vbo_info); - - memcpy(ptr, &vbo_info, sizeof(vbo_info)); - ptr += sizeof(struct radv_vbo_info); - - memcpy(ptr, &layout->vbo_offsets[vbo_info.binding], sizeof(uint32_t)); - ptr += sizeof(uint32_t); - } - params->vb_desc_usage_mask = vs->info.vs.vb_desc_usage_mask; - params->vbo_reg = radv_get_user_sgpr(vs, AC_UD_VS_VERTEX_BUFFERS); - - *upload_data = (char *)*upload_data + vb_size; - } + /* Restore conditional rendering. */ + cmd_buffer->state.predicating = old_predicating; } static void -radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo, - unsigned *upload_size, unsigned *upload_offset, void **upload_data, - struct radv_dgc_params *params, bool cond_render_enabled) +radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset, + void **upload_data, struct radv_dgc_params *params, bool cond_render_enabled) + { - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); + VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet); const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); - const uint32_t alloc_size = pipeline ? sizeof(struct radv_compute_pipeline_metadata) : 0; + const uint32_t alloc_size = ies ? 0 : sizeof(struct radv_compute_pipeline_metadata); *upload_size = MAX2(*upload_size + alloc_size, 16); @@ -2607,59 +2810,172 @@ radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo params->predication_type = cmd_buffer->state.predication_type; } - if (pipeline) { - struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline); - struct radv_shader *cs = radv_get_shader(compute_pipeline->base.shaders, MESA_SHADER_COMPUTE); + if (ies) { + struct radv_descriptor_state *descriptors_state = + radv_get_descriptors_state(state_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE); + + radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state); + + params->ies_stride = ies->stride; + params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va; + } else { + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); + const struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE); struct radv_compute_pipeline_metadata *metadata = (struct radv_compute_pipeline_metadata *)(*upload_data); radv_get_compute_shader_metadata(device, cs, metadata); *upload_data = (char *)*upload_data + alloc_size; + } +} + +static void +radv_prepare_dgc_rt(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + unsigned *upload_size, unsigned *upload_offset, void **upload_data, struct radv_dgc_params *params) +{ + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) { + vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return; + } + + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); + const struct radv_ray_tracing_pipeline *rt_pipeline = 
radv_pipeline_to_ray_tracing(pipeline); + const struct radv_shader *rt_prolog = rt_pipeline->prolog; + + params->wave32 = rt_prolog->info.wave_size == 32; + params->grid_base_sgpr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_GRID_SIZE); + params->cs_sbt_descriptors = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS); + params->cs_ray_launch_size_addr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR); +} + +static uint32_t +get_dgc_vertex_binding_offset(const struct radv_indirect_command_layout *layout, uint32_t binding) +{ + for (uint32_t i = 0; i < layout->vk.n_vb_layouts; i++) { + if (layout->vk.vb_layouts[i].binding == binding) + return layout->vk.vb_layouts[i].src_offset_B; + } + + return -1; +} + +static void +radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset, + void **upload_data, struct radv_dgc_params *params) +{ + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); + + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); + + const gl_shader_stage first_stage = + (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX; + struct radv_shader *first_shader = radv_dgc_get_shader(pipeline_info, eso_info, first_stage); + + unsigned vb_size = (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) ? MAX_VBS * DGC_VBO_INFO_SIZE : 0; + + *upload_size = MAX2(*upload_size + vb_size, 16); + + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) { + vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return; + } + + uint16_t vtx_base_sgpr = radv_get_user_sgpr(first_shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE); + const bool uses_drawid = first_shader->info.vs.needs_draw_id; + + if (uses_drawid) + vtx_base_sgpr |= DGC_USES_DRAWID; + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) { + if (first_shader->info.cs.uses_grid_size) + vtx_base_sgpr |= DGC_USES_GRID_SIZE; + + const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK); + if (task_shader) { + params->has_task_shader = 1; + params->mesh_ring_entry_sgpr = radv_get_user_sgpr(first_shader, AC_UD_TASK_RING_ENTRY); + params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch; + params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY); + params->wave32 = task_shader->info.wave_size == 32; + params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE); + params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID); + } } else { - struct radv_descriptor_state *descriptors_state = - radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE); + const bool uses_baseinstance = first_shader->info.vs.needs_base_instance; - radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state); + if (uses_baseinstance) + vtx_base_sgpr |= DGC_USES_BASEINSTANCE; + } - params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va; + params->vtx_base_sgpr = vtx_base_sgpr; + params->max_index_count = 
state_cmd_buffer->state.max_index_count; + params->max_draw_count = pGeneratedCommandsInfo->maxDrawCount; + params->dynamic_vs_input = + (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.dynamic_inputs; + params->use_per_attribute_vb_descs = + (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.use_per_attribute_vb_descs; + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { + uint8_t *ptr = (uint8_t *)((char *)*upload_data); + + for (uint32_t i = 0; i < MAX_VBS; i++) { + struct radv_vbo_info vbo_info; + radv_get_vbo_info(state_cmd_buffer, i, &vbo_info); + + const uint32_t vbo_offset = get_dgc_vertex_binding_offset(layout, vbo_info.binding); + + memcpy(ptr, &vbo_info, sizeof(vbo_info)); + ptr += sizeof(struct radv_vbo_info); + + memcpy(ptr, &vbo_offset, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + } + params->vb_desc_usage_mask = first_shader->info.vs.vb_desc_usage_mask; + params->vbo_reg = radv_get_user_sgpr(first_shader, AC_UD_VS_VERTEX_BUFFERS); + + *upload_data = (char *)*upload_data + vb_size; } } void -radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo, - bool cond_render_enabled) +radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled) { VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); - VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline); - VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer); - VK_FROM_HANDLE(radv_buffer, stream_buffer, pGeneratedCommandsInfo->pStreams[0].buffer); - VK_FROM_HANDLE(radv_buffer, sequence_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); + VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); const struct radv_physical_device *pdev = radv_device_physical(device); struct radv_meta_saved_state saved_state; - unsigned upload_offset, upload_size; + unsigned upload_offset, upload_size = 0; struct radv_buffer token_buffer; void *upload_data; - uint64_t upload_addr = - radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset; - - uint64_t stream_addr = - radv_buffer_get_va(stream_buffer->bo) + stream_buffer->offset + pGeneratedCommandsInfo->pStreams[0].offset; - - uint64_t sequence_count_addr = 0; - if (sequence_count_buffer) - sequence_count_addr = radv_buffer_get_va(sequence_count_buffer->bo) + sequence_count_buffer->offset + - pGeneratedCommandsInfo->sequencesCountOffset; + const VkGeneratedCommandsPipelineInfoEXT *pipeline_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT); + const VkGeneratedCommandsShaderInfoEXT *eso_info = + vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT); const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo); - const uint32_t sequences_count = pGeneratedCommandsInfo->sequencesCount; + const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount; struct dgc_cmdbuf_layout cmdbuf_layout; - get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout); + get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout); - 
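radv_prepare_dgc() above no longer reconstructs the preprocess, stream and sequence-count VAs from buffer/offset pairs; VkGeneratedCommandsInfoEXT already carries device addresses (preprocessAddress, indirectAddress, sequenceCountAddress). An application-side sketch of filling that struct; members not visible in this patch (shaderStages, indirectAddressSize, preprocessSize) are quoted from the VK_EXT_device_generated_commands headers from memory and worth double-checking, and the lowercase variables are placeholders:

/* Application-side sketch (not driver code). */
VkBufferDeviceAddressInfo addr_info = {
   .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
   .buffer = indirect_buffer, /* holds the token stream written by the app */
};
const VkDeviceAddress indirect_va = vkGetBufferDeviceAddress(device, &addr_info);

addr_info.buffer = preprocess_buffer; /* sized with vkGetGeneratedCommandsMemoryRequirementsEXT() */
const VkDeviceAddress preprocess_va = vkGetBufferDeviceAddress(device, &addr_info);

const VkGeneratedCommandsInfoEXT info = {
   .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_EXT,
   .shaderStages = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
   .indirectExecutionSet = VK_NULL_HANDLE, /* only needed with EXECUTION_SET tokens */
   .indirectCommandsLayout = layout,
   .indirectAddress = indirect_va,
   .indirectAddressSize = stream_size,
   .preprocessAddress = preprocess_va,
   .preprocessSize = preprocess_size,
   .maxSequenceCount = max_sequences,
   .sequenceCountAddress = 0, /* or the VA of a GPU-written count */
   .maxDrawCount = 1,
};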
assert((cmdbuf_layout.main_offset + upload_addr) % pdev->info.ip[AMD_IP_GFX].ib_alignment == 0); - assert((cmdbuf_layout.ace_main_offset + upload_addr) % pdev->info.ip[AMD_IP_COMPUTE].ib_alignment == 0); + assert((cmdbuf_layout.main_offset + pGeneratedCommandsInfo->preprocessAddress) % + pdev->info.ip[AMD_IP_GFX].ib_alignment == + 0); + assert((cmdbuf_layout.ace_main_offset + pGeneratedCommandsInfo->preprocessAddress) % + pdev->info.ip[AMD_IP_COMPUTE].ib_alignment == + 0); struct radv_dgc_params params = { .cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset, @@ -2672,67 +2988,98 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn .ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride, .ace_cmd_buf_size = cmdbuf_layout.ace_size, .upload_main_offset = cmdbuf_layout.upload_offset, - .upload_addr = (uint32_t)upload_addr, + .upload_addr = (uint32_t)pGeneratedCommandsInfo->preprocessAddress, .upload_stride = cmdbuf_layout.upload_stride, - .sequence_count = sequences_count | (sequence_count_addr ? 1u << 31 : 0), - .sequence_count_addr = sequence_count_addr, + .sequence_count = sequences_count, .use_preamble = use_preamble, - .stream_addr = stream_addr, + .stream_addr = pGeneratedCommandsInfo->indirectAddress, + .sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress, + .ies_addr = ies ? ies->va : 0, + .queue_family = state_cmd_buffer->qf, }; - upload_size = layout->push_constant_size + ARRAY_SIZE(pipeline->shaders) * 12; - if (!layout->push_constant_mask) - upload_size = 0; + VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout); - if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { - radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, - ¶ms); + if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) { + upload_size = pipeline_layout->push_constant_size + MESA_VULKAN_SHADER_STAGES * 12; + } + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) { + radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset, + &upload_data, ¶ms, cond_render_enabled); + } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) { + radv_prepare_dgc_rt(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, ¶ms); } else { - assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE); - radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, ¶ms, - cond_render_enabled); + radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset, + &upload_data, ¶ms); } if (layout->push_constant_mask) { VkShaderStageFlags pc_stages = 0; uint32_t *desc = upload_data; - upload_data = (char *)upload_data + ARRAY_SIZE(pipeline->shaders) * 12; + upload_data = (char *)upload_data + MESA_VULKAN_SHADER_STAGES * 12; - if (pipeline) { - for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) { - if (!pipeline->shaders[i]) - continue; + struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0}; + if (pipeline_info) { + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline); - const struct radv_shader *shader = pipeline->shaders[i]; - const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs; - if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) - params.const_copy = 1; + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) { + const 
struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline); + struct radv_shader *rt_prolog = rt_pipeline->prolog; - if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 || - locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) { - unsigned upload_sgpr = 0; - unsigned inline_sgpr = 0; + shaders[MESA_SHADER_COMPUTE] = rt_prolog; + } else { + memcpy(shaders, pipeline->shaders, sizeof(shaders)); + } + } else if (eso_info) { + for (unsigned i = 0; i < eso_info->shaderCount; ++i) { + VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]); + struct radv_shader *shader = shader_object->shader; + gl_shader_stage stage = shader->info.stage; - if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) { - upload_sgpr = radv_get_user_sgpr(shader, AC_UD_PUSH_CONSTANTS); - } + shaders[stage] = shader; + } + } - if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) { - inline_sgpr = radv_get_user_sgpr(shader, AC_UD_INLINE_PUSH_CONSTANTS); - desc[i * 3 + 1] = pipeline->shaders[i]->info.inline_push_constant_mask; - desc[i * 3 + 2] = pipeline->shaders[i]->info.inline_push_constant_mask >> 32; - } - desc[i * 3] = upload_sgpr | (inline_sgpr << 16); + for (unsigned i = 0; i < ARRAY_SIZE(shaders); i++) { + const struct radv_shader *shader = shaders[i]; - pc_stages |= mesa_to_vk_shader_stage(i); + if (!shader) + continue; + + const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs; + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) { + params.const_copy = 1; + } + + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 || + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) { + unsigned upload_sgpr = 0; + unsigned inline_sgpr = 0; + + if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) { + upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx - + SI_SH_REG_OFFSET) >> + 2; } + + if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) { + inline_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx - + SI_SH_REG_OFFSET) >> + 2; + desc[i * 3 + 1] = shader->info.inline_push_constant_mask; + desc[i * 3 + 2] = shader->info.inline_push_constant_mask >> 32; + } + desc[i * 3] = upload_sgpr | (inline_sgpr << 16); + + pc_stages |= mesa_to_vk_shader_stage(i); } } params.push_constant_stages = pc_stages; - memcpy(upload_data, cmd_buffer->push_constants, layout->push_constant_size); - upload_data = (char *)upload_data + layout->push_constant_size; + memcpy(upload_data, state_cmd_buffer->push_constants, pipeline_layout->push_constant_size); + upload_data = (char *)upload_data + pipeline_layout->push_constant_size; } radv_buffer_init(&token_buffer, device, cmd_buffer->upload.upload_bo, upload_size, upload_offset); @@ -2756,36 +3103,256 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn .offset = 0, .range = upload_size}}}); - unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->sequencesCount, 64)); + unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->maxSequenceCount, 64)); vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); radv_buffer_finish(&token_buffer); radv_meta_restore(&saved_state, cmd_buffer); } -/* VK_NV_device_generated_commands_compute */ -VKAPI_ATTR void VKAPI_CALL -radv_GetPipelineIndirectMemoryRequirementsNV(VkDevice _device, const VkComputePipelineCreateInfo *pCreateInfo, - VkMemoryRequirements2 
*pMemoryRequirements) +static void +radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator, + struct radv_indirect_command_layout *layout) +{ + radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc); + + vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateIndirectCommandsLayoutEXT(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout) +{ + VK_FROM_HANDLE(radv_device, device, _device); + struct radv_indirect_command_layout *layout; + VkResult result; + + layout = vk_indirect_command_layout_create(&device->vk, pCreateInfo, pAllocator, sizeof(*layout)); + if (!layout) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + for (uint32_t i = 0; i < layout->vk.n_pc_layouts; i++) { + for (uint32_t j = layout->vk.pc_layouts[i].dst_offset_B / 4, k = 0; k < layout->vk.pc_layouts[i].size_B / 4; + j++, k++) { + layout->push_constant_mask |= 1ull << j; + layout->push_constant_offsets[j] = layout->vk.pc_layouts[i].src_offset_B + k * 4; + } + } + + if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_SI)) { + layout->sequence_index_mask = 1ull << (layout->vk.si_layout.dst_offset_B / 4); + layout->push_constant_mask |= layout->sequence_index_mask; + } + + result = radv_create_dgc_pipeline(device, layout); + if (result != VK_SUCCESS) { + radv_destroy_indirect_commands_layout(device, pAllocator, layout); + return result; + } + + *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout); + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +radv_DestroyIndirectCommandsLayoutEXT(VkDevice _device, VkIndirectCommandsLayoutEXT indirectCommandsLayout, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(radv_device, device, _device); + VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout); + + if (!layout) + return; + + vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk); +} + +static void +radv_update_ies_shader(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index, + struct radv_shader *shader) +{ + const struct radv_physical_device *pdev = radv_device_physical(device); + uint8_t *ptr = set->mapped_ptr + set->stride * index; + struct radv_compute_pipeline_metadata md; + struct radeon_cmdbuf *cs; + + assert(shader->info.stage == MESA_SHADER_COMPUTE); + radv_get_compute_shader_metadata(device, shader, &md); + + cs = calloc(1, sizeof(*cs)); + if (!cs) + return; + + cs->reserved_dw = cs->max_dw = 32; + cs->buf = malloc(cs->max_dw * 4); + if (!cs->buf) { + free(cs); + return; + } + + radv_emit_compute_shader(pdev, cs, shader); + + memcpy(ptr, &md, sizeof(md)); + ptr += sizeof(md); + + memcpy(ptr, &cs->cdw, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + + memcpy(ptr, cs->buf, cs->cdw * sizeof(uint32_t)); + ptr += cs->cdw * sizeof(uint32_t); + + set->compute_scratch_size_per_wave = MAX2(set->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave); + set->compute_scratch_waves = MAX2(set->compute_scratch_waves, radv_get_max_scratch_waves(device, shader)); + + free(cs->buf); + free(cs); +} + +static void +radv_update_ies_pipeline(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index, + const struct radv_pipeline *pipeline) +{ + assert(pipeline->type == RADV_PIPELINE_COMPUTE); + 
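radv_update_ies_shader() above serializes three things per execution-set slot: the radv_compute_pipeline_metadata consumed by the DGC meta shader, the number of command dwords, and the SET_SH_REG stream produced by radv_emit_compute_shader(); the matching per-slot stride is computed the same way in radv_CreateIndirectExecutionSetEXT() below. A small illustrative accessor, not part of the patch, that spells out the layout:

/* Illustrative only: one slot of the IES buffer as written above, i.e.
 *   struct radv_compute_pipeline_metadata   metadata read by the DGC shader
 *   uint32_t                                count of command dwords that follow
 *   uint32_t[count]                         pre-built register writes
 */
static inline uint8_t *
radv_ies_slot_ptr(const struct radv_indirect_execution_set *set, uint32_t index)
{
   return set->mapped_ptr + (size_t)set->stride * index;
}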
radv_update_ies_shader(device, set, index, pipeline->shaders[MESA_SHADER_COMPUTE]); +} + +static void +radv_destroy_indirect_execution_set(struct radv_device *device, const VkAllocationCallbacks *pAllocator, + struct radv_indirect_execution_set *set) +{ + if (set->bo) + radv_bo_destroy(device, &set->base, set->bo); + + vk_object_base_finish(&set->base); + vk_free2(&device->vk.alloc, pAllocator, set); +} + +VKAPI_ATTR VkResult VKAPI_CALL +radv_CreateIndirectExecutionSetEXT(VkDevice _device, const VkIndirectExecutionSetCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkIndirectExecutionSetEXT *pIndirectExecutionSet) { - VkMemoryRequirements *reqs = &pMemoryRequirements->memoryRequirements; VK_FROM_HANDLE(radv_device, device, _device); const struct radv_physical_device *pdev = radv_device_physical(device); - uint32_t size; + struct radv_indirect_execution_set *set; + uint32_t num_entries; + uint32_t stride; + VkResult result; - size = sizeof(struct radv_compute_pipeline_metadata); - size += 4 /* num CS DW */; - size += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4; + set = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*set), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!set) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - reqs->memoryTypeBits = ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit; - reqs->alignment = 4; - reqs->size = align(size, reqs->alignment); + vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT); + + switch (pCreateInfo->type) { + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: { + const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo; + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline); + + assert(pipeline->type == RADV_PIPELINE_COMPUTE); + num_entries = pipeline_info->maxPipelineCount; + break; + } + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: { + const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo; + VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]); + + assert(shader_object->stage == MESA_SHADER_COMPUTE); + num_entries = shaders_info->maxShaderCount; + break; + } + default: + unreachable("Invalid IES type"); + } + + stride = sizeof(struct radv_compute_pipeline_metadata); + stride += 4 /* num CS DW */; + stride += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4; + + result = radv_bo_create(device, &set->base, num_entries * stride, 8, RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY, RADV_BO_PRIORITY_DESCRIPTOR, 0, + false, &set->bo); + if (result != VK_SUCCESS) { + radv_destroy_indirect_execution_set(device, pAllocator, set); + return vk_error(device, result); + } + + set->mapped_ptr = (uint8_t *)radv_buffer_map(device->ws, set->bo); + if (!set->mapped_ptr) { + radv_destroy_indirect_execution_set(device, pAllocator, set); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + set->va = radv_buffer_get_va(set->bo); + set->stride = stride; + + /* The driver is supposed to always populate slot 0 with the initial pipeline/shader. 
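For reference, the application-side counterpart of the creation and update paths handled above and below: a pipeline-type VkIndirectExecutionSetEXT created from an initial indirect-bindable compute pipeline, then one slot overwritten. This is a hedged usage sketch; the handles and counts are placeholders and the sType values should be checked against the extension headers:

/* Application-side sketch (not driver code). */
const VkIndirectExecutionSetPipelineInfoEXT ies_pipeline_info = {
   .sType = VK_STRUCTURE_TYPE_INDIRECT_EXECUTION_SET_PIPELINE_INFO_EXT,
   .initialPipeline = compute_pipeline, /* created with VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT */
   .maxPipelineCount = 8,
};
const VkIndirectExecutionSetCreateInfoEXT ies_create_info = {
   .sType = VK_STRUCTURE_TYPE_INDIRECT_EXECUTION_SET_CREATE_INFO_EXT,
   .type = VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT,
   .info.pPipelineInfo = &ies_pipeline_info,
};
VkIndirectExecutionSetEXT ies;
vkCreateIndirectExecutionSetEXT(device, &ies_create_info, NULL, &ies);

/* Point slot 3 at another indirect-bindable compute pipeline. */
const VkWriteIndirectExecutionSetPipelineEXT write = {
   .sType = VK_STRUCTURE_TYPE_WRITE_INDIRECT_EXECUTION_SET_PIPELINE_EXT,
   .index = 3,
   .pipeline = other_compute_pipeline,
};
vkUpdateIndirectExecutionSetPipelineEXT(device, ies, 1, &write);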
*/ + switch (pCreateInfo->type) { + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: { + const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo; + VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline); + + radv_update_ies_pipeline(device, set, 0, pipeline); + break; + } + case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: { + const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo; + VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]); + + radv_update_ies_shader(device, set, 0, shader_object->shader); + break; + } + default: + unreachable("Invalid IES type"); + } + + *pIndirectExecutionSet = radv_indirect_execution_set_to_handle(set); + return VK_SUCCESS; } -VKAPI_ATTR VkDeviceAddress VKAPI_CALL -radv_GetPipelineIndirectDeviceAddressNV(VkDevice device, const VkPipelineIndirectDeviceAddressInfoNV *pInfo) +VKAPI_ATTR void VKAPI_CALL +radv_DestroyIndirectExecutionSetEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet, + const VkAllocationCallbacks *pAllocator) { - VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline); + VK_FROM_HANDLE(radv_device, device, _device); + VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet); - return radv_pipeline_to_compute(pipeline)->indirect.va; + if (!set) + return; + + radv_destroy_indirect_execution_set(device, pAllocator, set); +} + +VKAPI_ATTR void VKAPI_CALL +radv_UpdateIndirectExecutionSetPipelineEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet, + uint32_t executionSetWriteCount, + const VkWriteIndirectExecutionSetPipelineEXT *pExecutionSetWrites) +{ + VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet); + VK_FROM_HANDLE(radv_device, device, _device); + + for (uint32_t i = 0; i < executionSetWriteCount; i++) { + const VkWriteIndirectExecutionSetPipelineEXT *writeset = &pExecutionSetWrites[i]; + VK_FROM_HANDLE(radv_pipeline, pipeline, writeset->pipeline); + + radv_update_ies_pipeline(device, set, writeset->index, pipeline); + } +} + +VKAPI_ATTR void VKAPI_CALL +radv_UpdateIndirectExecutionSetShaderEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet, + uint32_t executionSetWriteCount, + const VkWriteIndirectExecutionSetShaderEXT *pExecutionSetWrites) +{ + VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet); + VK_FROM_HANDLE(radv_device, device, _device); + + for (uint32_t i = 0; i < executionSetWriteCount; i++) { + const VkWriteIndirectExecutionSetShaderEXT *writeset = &pExecutionSetWrites[i]; + VK_FROM_HANDLE(radv_shader_object, shader_object, writeset->shader); + + radv_update_ies_shader(device, set, writeset->index, shader_object->shader); + } } diff --git a/src/amd/vulkan/radv_dgc.h b/src/amd/vulkan/radv_dgc.h new file mode 100644 index 00000000000..f208a07805a --- /dev/null +++ b/src/amd/vulkan/radv_dgc.h @@ -0,0 +1,66 @@ +/* + * Copyright © 2024 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#ifndef RADV_DGC_H +#define RADV_DGC_H + +#include "compiler/shader_enums.h" + +#include "radv_constants.h" + +#include "vk_device_generated_commands.h" + +struct radv_cmd_buffer; +enum radv_queue_family; + +struct radv_indirect_command_layout { + struct vk_indirect_command_layout vk; + + uint64_t push_constant_mask; + uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4]; + uint64_t sequence_index_mask; + + VkPipeline pipeline; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, 
vk.base, VkIndirectCommandsLayoutEXT, + VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT) + +struct radv_indirect_execution_set { + struct vk_object_base base; + + struct radeon_winsys_bo *bo; + uint64_t va; + uint8_t *mapped_ptr; + + uint32_t stride; + + uint32_t compute_scratch_size_per_wave; + uint32_t compute_scratch_waves; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_execution_set, base, VkIndirectExecutionSetEXT, + VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT); + +uint32_t radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); +uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); + +uint32_t radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); +uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); + +uint32_t radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); +uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); + +void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, + struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled); + +bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, + const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo); + +struct radv_shader *radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info, + const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage); + +#endif /* RADV_DGC_H */ diff --git a/src/amd/vulkan/radv_instance.c b/src/amd/vulkan/radv_instance.c index 9ec86ee0d8a..ff98e336596 100644 --- a/src/amd/vulkan/radv_instance.c +++ b/src/amd/vulkan/radv_instance.c @@ -148,7 +148,6 @@ static const driOptionDescription radv_dri_options[] = { DRI_CONF_RADV_DISABLE_TRUNC_COORD(false) DRI_CONF_RADV_DISABLE_SINKING_LOAD_INPUT_FS(false) DRI_CONF_RADV_DISABLE_DEPTH_STORAGE(false) - DRI_CONF_RADV_DGC(false) DRI_CONF_RADV_FLUSH_BEFORE_QUERY_COPY(false) DRI_CONF_RADV_ENABLE_UNIFIED_HEAP_ON_APU(false) DRI_CONF_RADV_TEX_NON_UNIFORM(false) @@ -243,8 +242,6 @@ radv_init_dri_options(struct radv_instance *instance) instance->drirc.override_ray_tracing_shader_version = driQueryOptioni(&instance->drirc.options, "radv_override_ray_tracing_shader_version"); - instance->drirc.enable_dgc = driQueryOptionb(&instance->drirc.options, "radv_dgc"); - instance->drirc.override_vram_size = driQueryOptioni(&instance->drirc.options, "override_vram_size"); instance->drirc.enable_khr_present_wait = driQueryOptionb(&instance->drirc.options, "vk_khr_present_wait"); diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h index 9c16c806cc3..fc103e1a1da 100644 --- a/src/amd/vulkan/radv_instance.h +++ b/src/amd/vulkan/radv_instance.h @@ -66,7 +66,6 @@ struct radv_instance { bool legacy_sparse_binding; bool force_pstate_peak_gfx11_dgpu; bool clear_lds; - bool enable_dgc; bool enable_khr_present_wait; bool report_llvm9_version_string; bool vk_require_etc2; diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index d6ce01d3c29..baf1ad57a9c 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -747,10 +747,6 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .INTEL_shader_integer_functions2 = true, .MESA_image_alignment_control = pdev->info.gfx_level >= GFX9 && 
pdev->info.gfx_level <= GFX11_5, .NV_compute_shader_derivatives = true, - .NV_device_generated_commands = - pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS), - .NV_device_generated_commands_compute = - pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS), /* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from * using it. */ @@ -1129,9 +1125,6 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc .performanceCounterQueryPools = has_perf_query, .performanceCounterMultipleQueryPools = has_perf_query, - /* VK_NV_device_generated_commands */ - .deviceGeneratedCommandsNV = true, - /* VK_EXT_attachment_feedback_loop_layout */ .attachmentFeedbackLoopLayout = true, @@ -1214,11 +1207,6 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc /* VK_KHR_maintenance5 */ .maintenance5 = true, - /* VK_NV_device_generated_commands_compute */ - .deviceGeneratedCompute = true, - .deviceGeneratedComputePipelines = true, - .deviceGeneratedComputeCaptureReplay = false, - /* VK_KHR_cooperative_matrix */ .cooperativeMatrix = pdev->info.gfx_level >= GFX11 && !pdev->use_llvm, .cooperativeMatrixRobustBufferAccess = pdev->info.gfx_level >= GFX11 && !pdev->use_llvm, @@ -1830,20 +1818,6 @@ radv_get_physical_device_properties(struct radv_physical_device *pdev) /* VK_KHR_performance_query */ p->allowCommandBufferQueryCopies = false; - /* VK_NV_device_generated_commands */ - p->maxIndirectCommandsStreamCount = 1; - p->maxIndirectCommandsStreamStride = UINT32_MAX; - p->maxIndirectCommandsTokenCount = 512; - p->maxIndirectCommandsTokenOffset = UINT16_MAX; - p->minIndirectCommandsBufferOffsetAlignment = 4; - p->minSequencesCountBufferOffsetAlignment = 4; - p->minSequencesIndexBufferOffsetAlignment = 4; - /* Don't support even a shader group count = 1 until we support shader - * overrides during pipeline creation. */ - p->maxGraphicsShaderGroupCount = 0; - /* MSB reserved for signalling indirect count enablement. 
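The NV feature and extension bits removed in this hunk are superseded by the single deviceGeneratedCommands feature of VK_EXT_device_generated_commands. A hedged application-side sketch of enabling it at device creation; the feature struct and sType names are quoted from memory and should be verified against vulkan_core.h, and queue_info/physical_device are placeholders:

/* Application-side sketch (not driver code): opt in to EXT DGC. */
VkPhysicalDeviceDeviceGeneratedCommandsFeaturesEXT dgc_features = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_EXT,
   .deviceGeneratedCommands = VK_TRUE,
};
const char *extensions[] = {VK_EXT_DEVICE_GENERATED_COMMANDS_EXTENSION_NAME};
const VkDeviceCreateInfo device_info = {
   .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
   .pNext = &dgc_features,
   .queueCreateInfoCount = 1,
   .pQueueCreateInfos = &queue_info,
   .enabledExtensionCount = 1,
   .ppEnabledExtensionNames = extensions,
};
VkDevice device;
vkCreateDevice(physical_device, &device_info, NULL, &device);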
*/ - p->maxIndirectSequenceCount = UINT32_MAX >> 1; - /* VK_EXT_graphics_pipeline_library */ p->graphicsPipelineLibraryFastLinking = true; p->graphicsPipelineLibraryIndependentInterpolationDecoration = true; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 56cc2b4b62e..a9df9b6b8ae 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -137,7 +137,7 @@ radv_pipeline_get_shader_key(const struct radv_device *device, const VkPipelineS if (flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR) key.view_index_from_device_index = 1; - if (flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) + if (flags & VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT) key.indirect_bindable = 1; if (stage->stage & RADV_GRAPHICS_STAGE_BITS) { diff --git a/src/amd/vulkan/radv_pipeline_compute.c b/src/amd/vulkan/radv_pipeline_compute.c index de9abc08505..32d04d44257 100644 --- a/src/amd/vulkan/radv_pipeline_compute.c +++ b/src/amd/vulkan/radv_pipeline_compute.c @@ -305,37 +305,6 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC radv_compute_pipeline_init(pipeline, pipeline_layout, pipeline->base.shaders[MESA_SHADER_COMPUTE]); - if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) { - const VkComputePipelineIndirectBufferInfoNV *indirect_buffer = - vk_find_struct_const(pCreateInfo->pNext, COMPUTE_PIPELINE_INDIRECT_BUFFER_INFO_NV); - struct radv_shader *shader = pipeline->base.shaders[MESA_SHADER_COMPUTE]; - const struct radv_physical_device *pdev = radv_device_physical(device); - struct radeon_cmdbuf *cs = &pipeline->indirect.cs; - - cs->reserved_dw = cs->max_dw = 32; - cs->buf = malloc(cs->max_dw * 4); - if (!cs->buf) { - radv_pipeline_destroy(device, &pipeline->base, pAllocator); - return result; - } - - radv_emit_compute_shader(pdev, cs, shader); - - pipeline->indirect.va = indirect_buffer->deviceAddress; - pipeline->indirect.size = indirect_buffer->size; - - /* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queues supporting transfer - * operations and it's not required to call it on the same queue as the DGC execute. Because - * it's not possible to know if the compute shader uses scratch when DGC execute is called, - * the only solution is gather the max scratch size of all indirect pipelines. 
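The radv_pipeline.c hunk above keys the shader key bit off VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT, and the NV-specific per-pipeline indirect command buffer is being dropped from radv_compute_pipeline. On the application side the flag is passed through VkPipelineCreateFlags2CreateInfoKHR (VK_KHR_maintenance5); a hedged sketch, with cs_stage_info and pipeline_layout as placeholders:

/* Application-side sketch (not driver code): an indirect-bindable compute
 * pipeline that can later be written into a VkIndirectExecutionSetEXT. */
const VkPipelineCreateFlags2CreateInfoKHR flags2 = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR,
   .flags = VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT,
};
const VkComputePipelineCreateInfo pipeline_info = {
   .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
   .pNext = &flags2,
   .stage = cs_stage_info,
   .layout = pipeline_layout,
};
VkPipeline pipeline;
vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &pipeline_info, NULL, &pipeline);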
- */ - simple_mtx_lock(&device->compute_scratch_mtx); - device->compute_scratch_size_per_wave = - MAX2(device->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave); - device->compute_scratch_waves = MAX2(device->compute_scratch_waves, radv_get_max_scratch_waves(device, shader)); - simple_mtx_unlock(&device->compute_scratch_mtx); - } - *pPipeline = radv_pipeline_to_handle(&pipeline->base); radv_rmv_log_compute_pipeline_create(device, &pipeline->base, pipeline->base.is_internal); return VK_SUCCESS; @@ -371,12 +340,8 @@ radv_create_compute_pipelines(VkDevice _device, VkPipelineCache pipelineCache, u void radv_destroy_compute_pipeline(struct radv_device *device, struct radv_compute_pipeline *pipeline) { - struct radeon_cmdbuf *cs = &pipeline->indirect.cs; - if (pipeline->base.shaders[MESA_SHADER_COMPUTE]) radv_shader_unref(device, pipeline->base.shaders[MESA_SHADER_COMPUTE]); - - free(cs->buf); } VKAPI_ATTR VkResult VKAPI_CALL diff --git a/src/amd/vulkan/radv_pipeline_compute.h b/src/amd/vulkan/radv_pipeline_compute.h index cc75d90396a..9ac879d7ad9 100644 --- a/src/amd/vulkan/radv_pipeline_compute.h +++ b/src/amd/vulkan/radv_pipeline_compute.h @@ -19,12 +19,6 @@ struct radv_shader_info; struct radv_compute_pipeline { struct radv_pipeline base; - - struct { - struct radeon_cmdbuf cs; - uint64_t va; - uint64_t size; - } indirect; }; RADV_DECL_PIPELINE_DOWNCAST(compute, RADV_PIPELINE_COMPUTE) diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c index 4c02188d0d4..16127383a4d 100644 --- a/src/amd/vulkan/radv_queue.c +++ b/src/amd/vulkan/radv_queue.c @@ -1268,7 +1268,6 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device bool *has_follower) { const struct radv_physical_device *pdev = radv_device_physical(device); - bool has_indirect_pipeline_binds = false; if (queue->qf != RADV_QUEUE_GENERAL && queue->qf != RADV_QUEUE_COMPUTE) { for (uint32_t j = 0; j < cmd_buffer_count; j++) { @@ -1308,16 +1307,6 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device needs.sample_positions |= cmd_buffer->sample_positions_needed; *use_perf_counters |= cmd_buffer->state.uses_perf_counters; *has_follower |= !!cmd_buffer->gang.cs; - - has_indirect_pipeline_binds |= cmd_buffer->has_indirect_pipeline_binds; - } - - if (has_indirect_pipeline_binds) { - /* Use the maximum possible scratch size for indirect compute pipelines with DGC. */ - simple_mtx_lock(&device->compute_scratch_mtx); - needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_waves, device->compute_scratch_size_per_wave); - needs.compute_scratch_waves = MAX2(needs.compute_scratch_waves, device->compute_scratch_waves); - simple_mtx_unlock(&device->compute_scratch_mtx); } /* Sanitize scratch size information. */ diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 60c2d9c9b55..369586f2a0d 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -886,13 +886,6 @@ radv_create_shader_arena(struct radv_device *device, struct radv_shader_free_lis if (replayable) flags |= RADEON_FLAG_REPLAYABLE; - /* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer - * operations and it's not required to call it on the same queue as DGC execute. To make sure the - * compute shader BO is part of the DGC execute submission, force all shaders to be local BOs. 
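With the NV compute path gone, the device-global compute_scratch_* bookkeeping and the local-BO workaround above are no longer needed: scratch requirements are accumulated per execution set (compute_scratch_size_per_wave / compute_scratch_waves, updated in radv_update_ies_shader()). How those values are folded back into a submission is not visible in this excerpt; the helper below is a purely hypothetical sketch of that step, for orientation only:

/* Hypothetical, not from this patch: merge an execution set's scratch needs
 * into whatever per-submission scratch accounting consumes them. */
static void
radv_ies_accumulate_scratch(const struct radv_indirect_execution_set *set,
                            uint32_t *scratch_size_per_wave, uint32_t *scratch_waves)
{
   *scratch_size_per_wave = MAX2(*scratch_size_per_wave, set->compute_scratch_size_per_wave);
   *scratch_waves = MAX2(*scratch_waves, set->compute_scratch_waves);
}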
- */ - if (device->vk.enabled_features.deviceGeneratedComputePipelines) - flags |= RADEON_FLAG_PREFER_LOCAL_BO; - VkResult result; result = radv_bo_create(device, NULL, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM, flags, RADV_BO_PRIORITY_SHADER, replay_va, true, &arena->bo); diff --git a/src/amd/vulkan/radv_shader_object.c b/src/amd/vulkan/radv_shader_object.c index 36f55a4af4d..e5b57476e0f 100644 --- a/src/amd/vulkan/radv_shader_object.c +++ b/src/amd/vulkan/radv_shader_object.c @@ -110,6 +110,9 @@ radv_shader_stage_init(const VkShaderCreateInfoEXT *sinfo, struct radv_shader_st out_stage->key.subgroup_require_full = 1; } + if (sinfo->flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT) + out_stage->key.indirect_bindable = 1; + if (out_stage->stage == MESA_SHADER_MESH) { out_stage->key.has_task_shader = !(sinfo->flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT); } diff --git a/src/util/00-radv-defaults.conf b/src/util/00-radv-defaults.conf index 555fc3c79af..371f9c8a7ca 100644 --- a/src/util/00-radv-defaults.conf +++ b/src/util/00-radv-defaults.conf @@ -42,7 +42,6 @@ Application bugs worked around in this file:
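Putting the new entry points together from the application's point of view: with a layout created using VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT, preprocessing is recorded explicitly (passing the command buffer whose bound state it should capture) and execution happens on that state command buffer. A hedged usage sketch assuming both command buffers go to the same queue, preprocess first; "info" is a VkGeneratedCommandsInfoEXT as in the earlier sketch and the barrier uses standard synchronization2 names:

/* Application-side sketch (not driver code). */
vkCmdPreprocessGeneratedCommandsEXT(preprocess_cmd_buf, &info, state_cmd_buf);

/* In the state command buffer, make the preprocess results visible before
 * executing the generated commands. */
const VkMemoryBarrier2 barrier = {
   .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
   .srcStageMask = VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT,
   .srcAccessMask = VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_EXT,
   .dstStageMask = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT,
   .dstAccessMask = VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT,
};
const VkDependencyInfo dep = {
   .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
   .memoryBarrierCount = 1,
   .pMemoryBarriers = &barrier,
};
vkCmdPipelineBarrier2(state_cmd_buf, &dep);

vkCmdExecuteGeneratedCommandsEXT(state_cmd_buf, VK_TRUE /* isPreprocessed */, &info);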