diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index eda1fdf00b2..520dc2c741d 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -1263,11 +1263,12 @@ sqtt_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
}
VKAPI_ATTR void VKAPI_CALL
-sqtt_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
- const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+sqtt_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
+ const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
/* There is no ExecuteIndirect Vulkan event in RGP yet. */
- API_MARKER_ALIAS(ExecuteGeneratedCommandsNV, ExecuteCommands, commandBuffer, isPreprocessed, pGeneratedCommandsInfo);
+ API_MARKER_ALIAS(ExecuteGeneratedCommandsEXT, ExecuteCommands, commandBuffer, isPreprocessed,
+ pGeneratedCommandsInfo);
}
VKAPI_ATTR void VKAPI_CALL
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
index 087b31b5af6..46af9008134 100644
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@@ -111,8 +111,8 @@ libradv_files = files(
'radv_device_memory.h',
'radv_descriptor_set.c',
'radv_descriptor_set.h',
- 'radv_device_generated_commands.c',
- 'radv_device_generated_commands.h',
+ 'radv_dgc.c',
+ 'radv_dgc.h',
'radv_event.c',
'radv_event.h',
'radv_formats.c',
diff --git a/src/amd/vulkan/meta/radv_meta.c b/src/amd/vulkan/meta/radv_meta.c
index 0eac0e6450d..63a94854f5b 100644
--- a/src/amd/vulkan/meta/radv_meta.c
+++ b/src/amd/vulkan/meta/radv_meta.c
@@ -511,7 +511,7 @@ radv_device_init_meta(struct radv_device *device)
if (result != VK_SUCCESS)
goto fail_astc_decode;
- if (radv_uses_device_generated_commands(device)) {
+ if (device->vk.enabled_features.deviceGeneratedCommands) {
result = radv_device_init_dgc_prepare_state(device, on_demand);
if (result != VK_SUCCESS)
goto fail_dgc;
diff --git a/src/amd/vulkan/radv_buffer.c b/src/amd/vulkan/radv_buffer.c
index 0bff0f7b20d..b265255d8cf 100644
--- a/src/amd/vulkan/radv_buffer.c
+++ b/src/amd/vulkan/radv_buffer.c
@@ -194,23 +194,11 @@ radv_get_buffer_memory_requirements(struct radv_device *device, VkDeviceSize siz
pMemoryRequirements->memoryRequirements.memoryTypeBits =
((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;
- /* Allow 32-bit address-space for DGC usage, as this buffer will contain
- * cmd buffer upload buffers, and those get passed to shaders through 32-bit
- * pointers.
- *
- * We only allow it with this usage set, to "protect" the 32-bit address space
- * from being overused. The actual requirement is done as part of
- * vkGetGeneratedCommandsMemoryRequirementsNV. (we have to make sure their
- * intersection is non-zero at least)
- */
- if ((usage & VK_BUFFER_USAGE_2_INDIRECT_BUFFER_BIT_KHR) && radv_uses_device_generated_commands(device))
- pMemoryRequirements->memoryRequirements.memoryTypeBits |= pdev->memory_types_32bit;
-
/* Force 32-bit address-space for descriptor buffers usage because they are passed to shaders
* through 32-bit pointers.
*/
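+   /* The DGC preprocess buffer needs the same treatment: its upload data is referenced by the
+    * generated commands through 32-bit pointers.
+    */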
- if (usage &
- (VK_BUFFER_USAGE_2_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | VK_BUFFER_USAGE_2_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT))
+ if (usage & (VK_BUFFER_USAGE_2_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_BUFFER_USAGE_2_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | VK_BUFFER_USAGE_2_PREPROCESS_BUFFER_BIT_EXT))
pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit;
if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index af770825ead..d205cebbda6 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -13,7 +13,7 @@
#include "radv_cp_dma.h"
#include "radv_cs.h"
#include "radv_debug.h"
-#include "radv_device_generated_commands.h"
+#include "radv_dgc.h"
#include "radv_event.h"
#include "radv_pipeline_rt.h"
#include "radv_radeon_winsys.h"
@@ -477,7 +477,6 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
cmd_buffer->gang.sem.emitted_leader_value = 0;
cmd_buffer->gang.sem.va = 0;
cmd_buffer->shader_upload_seq = 0;
- cmd_buffer->has_indirect_pipeline_binds = false;
if (cmd_buffer->upload.upload_bo)
radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@@ -646,8 +645,8 @@ radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_
/* Add stage flush only when necessary. */
if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
- VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
- VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV))
+ VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
/* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
@@ -6645,9 +6644,10 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_s
if (src_stage_mask &
(VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
- VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+ VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
- VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
+ VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_EXT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
}
@@ -6719,7 +6719,7 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2
has_DB_meta = false;
}
- if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV)
+ if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_EXT)
flush_bits |= RADV_CMD_FLAG_INV_L2;
if (src_flags & (VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR)) {
@@ -6808,9 +6808,8 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2
flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
/* Ensure the DGC meta shader can read the commands. */
- if (radv_uses_device_generated_commands(device)) {
+ if (device->vk.enabled_features.deviceGeneratedCommands) {
flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
-
if (pdev->info.gfx_level < GFX9)
flush_bits |= RADV_CMD_FLAG_INV_L2;
}
@@ -6849,7 +6848,7 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2
flush_bits |= RADV_CMD_FLAG_INV_L2;
}
- if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV) {
+ if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_EXT) {
flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
if (pdev->info.gfx_level < GFX9)
flush_bits |= RADV_CMD_FLAG_INV_L2;
@@ -11558,52 +11557,31 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b
}
/* TODO: Use these functions with the normal dispatch path. */
-static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
+static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point);
static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
-VKAPI_ATTR void VKAPI_CALL
-radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
- const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
-{
- VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
-
- if (!radv_dgc_can_preprocess(layout, pipeline))
- return;
-
- /* VK_EXT_conditional_rendering says that copy commands should not be
- * affected by conditional rendering.
- */
- const bool old_predicating = cmd_buffer->state.predicating;
- cmd_buffer->state.predicating = false;
-
- radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
-
- /* Restore conditional rendering. */
- cmd_buffer->state.predicating = old_predicating;
-}
-
+/* VK_EXT_device_generated_commands */
static void
-radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
- struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
- const bool has_task_shader = radv_dgc_with_task_shader(pGeneratedCommandsInfo);
-
+ const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
+ const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
const uint32_t cmdbuf_size = radv_get_indirect_main_cmdbuf_size(pGeneratedCommandsInfo);
- const uint64_t ib_va =
- radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
- const uint64_t main_trailer_va = ib_va + radv_get_indirect_main_trailer_offset(pGeneratedCommandsInfo);
+ const uint64_t ib_va = pGeneratedCommandsInfo->preprocessAddress;
const uint64_t main_ib_va = ib_va + radv_get_indirect_main_cmdbuf_offset(pGeneratedCommandsInfo);
+ const uint64_t main_trailer_va = ib_va + radv_get_indirect_main_trailer_offset(pGeneratedCommandsInfo);
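+   /* Chain the generated main IB into the current command stream; the trailer VA is what gets
+    * patched when IB2 isn't available.
+    */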
device->ws->cs_chain_dgc_ib(cmd_buffer->cs, main_ib_va, cmdbuf_size >> 2, main_trailer_va,
cmd_buffer->state.predicating);
- if (has_task_shader) {
+ if (task_shader) {
const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo);
- const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo);
const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo);
+ const uint64_t ace_trailer_va = ib_va + radv_get_indirect_ace_trailer_offset(pGeneratedCommandsInfo);
assert(cmd_buffer->gang.cs);
device->ws->cs_chain_dgc_ib(cmd_buffer->gang.cs, ace_ib_va, ace_cmdbuf_size >> 2, ace_trailer_va,
@@ -11612,82 +11590,82 @@ radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommand
}
VKAPI_ATTR void VKAPI_CALL
-radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
- const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+radv_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
+ const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
- VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
- struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
- const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
+ VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
+ VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+ const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
+ const bool compute = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH));
+ const bool rt = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT));
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
- /* Secondary command buffers are needed for the full extension but can't use
- * PKT3_INDIRECT_BUFFER.
- */
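+   /* Track the IES BO and account for the worst-case compute scratch needed by its shaders. */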
+ if (ies) {
+ radv_cs_add_buffer(device->ws, cmd_buffer->cs, ies->bo);
+
+ cmd_buffer->compute_scratch_size_per_wave_needed =
+ MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, ies->compute_scratch_size_per_wave);
+ cmd_buffer->compute_scratch_waves_wanted =
+ MAX2(cmd_buffer->compute_scratch_waves_wanted, ies->compute_scratch_waves);
+ }
+
+ /* Secondary command buffers are banned. */
assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
if (use_predication) {
- VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
- const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
- pGeneratedCommandsInfo->sequencesCountOffset;
-
+ const uint64_t va = pGeneratedCommandsInfo->sequenceCountAddress;
radv_begin_conditional_rendering(cmd_buffer, va, true);
}
- if (!radv_dgc_can_preprocess(layout, pipeline)) {
+ if (!(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT)) {
/* Suspend conditional rendering when the DGC execute is called on the compute queue to
- * generate a cmdbuf which will skips dispatches when necessary. This is because the
- * compute queue is missing IB2 which means it's not possible to skip the cmdbuf entirely.
- * It should also be suspended when task shaders are used because the DGC ACE IB would be
+ * generate a cmdbuf which skips dispatches when necessary. This is because the compute
+ * queue is missing IB2 which means it's not possible to skip the cmdbuf entirely. This
+ * should also be suspended when task shaders are used because the DGC ACE IB would be
* uninitialized otherwise.
*/
- const bool suspend_cond_render =
- (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_with_task_shader(pGeneratedCommandsInfo));
+ const bool suspend_conditional_rendering =
+ (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK));
const bool old_predicating = cmd_buffer->state.predicating;
- if (suspend_cond_render && cmd_buffer->state.predicating) {
+ if (suspend_conditional_rendering && cmd_buffer->state.predicating) {
cmd_buffer->state.predicating = false;
}
- radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
+ radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, cmd_buffer, old_predicating);
- if (suspend_cond_render) {
+ if (suspend_conditional_rendering) {
cmd_buffer->state.predicating = old_predicating;
}
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
- if (radv_dgc_with_task_shader(pGeneratedCommandsInfo)) {
- /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution
- * starts.
- */
+ /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution
+ * starts.
+ */
+ if (radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK)) {
radv_gang_barrier(cmd_buffer, VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV,
VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT);
}
}
- if (compute) {
- radv_dgc_before_dispatch(cmd_buffer);
-
- if (!pGeneratedCommandsInfo->pipeline)
- cmd_buffer->has_indirect_pipeline_binds = true;
+ if (rt) {
+ radv_dgc_before_dispatch(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
+ } else if (compute) {
+ radv_dgc_before_dispatch(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
} else {
- struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- struct radv_draw_info info;
+ struct radv_draw_info info = {
+ .count = pGeneratedCommandsInfo->maxSequenceCount,
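+      /* Only used as a non-NULL marker that this draw is indirect. */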
+ .indirect = (void *)&info,
+ .indexed = !!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)),
+ };
- info.count = pGeneratedCommandsInfo->sequencesCount;
- info.indirect = prep_buffer; /* We're not really going use it this way, but a good signal
- that this is not direct. */
- info.indirect_offset = 0;
- info.stride = 0;
- info.strmout_buffer = NULL;
- info.count_buffer = NULL;
- info.indexed = layout->indexed;
- info.instance_count = 0;
-
- if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true))
return;
} else {
@@ -11696,46 +11674,63 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
}
}
- const uint32_t view_mask = cmd_buffer->state.render.view_mask;
-
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
radeon_emit(cmd_buffer->cs, 0);
}
- radv_cs_add_buffer(device->ws, cmd_buffer->cs, prep_buffer->bo);
-
- if (compute || !view_mask) {
+ const uint32_t view_mask = cmd_buffer->state.render.view_mask;
+ if (rt || compute || !view_mask) {
radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
} else {
u_foreach_bit (view, view_mask) {
radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
-
radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
}
}
- if (compute) {
+ if (rt) {
+ cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
+
+ radv_dgc_after_dispatch(cmd_buffer);
+ } else if (compute) {
cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
- if (!pGeneratedCommandsInfo->pipeline)
+ if (ies)
radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
radv_dgc_after_dispatch(cmd_buffer);
} else {
- struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
-
- if (layout->binds_index_buffer) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
cmd_buffer->state.last_index_type = -1;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
}
- if (layout->bind_vbo_mask)
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB))
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
- cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
+ if (pipeline_info) {
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
+ struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
+
+ cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
+ } else {
+ assert(eso_info);
+
+ for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
+ VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
+
+ cmd_buffer->push_constant_stages |= mesa_to_vk_shader_stage(shader_object->stage);
+ }
+ }
+
+ if (!(layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED))) {
+ /* Non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be
+ * re-emitted before the next indexed draw.
+ */
+ cmd_buffer->state.last_index_type = -1;
+ }
- cmd_buffer->state.last_index_type = -1;
cmd_buffer->state.last_num_instances = -1;
cmd_buffer->state.last_vertex_offset_valid = false;
cmd_buffer->state.last_first_instance = -1;
@@ -12102,12 +12097,16 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf
}
static void
-radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
+radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
- struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
- struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
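+   /* For DGC RT, the RT prolog is what actually gets dispatched, so treat it as the compute shader. */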
+ struct radv_compute_pipeline *pipeline = bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
+ ? &cmd_buffer->state.rt_pipeline->base
+ : cmd_buffer->state.compute_pipeline;
+ struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
+ ? cmd_buffer->state.rt_pipeline->prolog
+ : cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
/* We will have run the DGC patch shaders before, so we can assume that there is something to
@@ -12119,9 +12118,11 @@ radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
if (pipeline)
radv_emit_compute_pipeline(cmd_buffer, pipeline);
+ if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR)
+ radv_emit_rt_stack_size(cmd_buffer);
radv_emit_cache_flush(cmd_buffer);
- radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
+ radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
if (pipeline_is_dirty) {
const bool has_prefetch = pdev->info.gfx_level >= GFX7;
@@ -12136,7 +12137,9 @@ radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
* We only need to do this when the pipeline is dirty because when we switch between
* the two we always need to switch pipelines.
*/
- radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
+ radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
+ ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
+ : VK_PIPELINE_BIND_POINT_COMPUTE);
}
}
@@ -13672,42 +13675,6 @@ radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlag
assert(cmd_buffer->cs->cdw <= cdw_max);
}
-VKAPI_ATTR void VKAPI_CALL
-radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
- VkPipeline pipeline, uint32_t groupIndex)
-{
- fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
- abort();
-}
-
-/* VK_NV_device_generated_commands_compute */
-VKAPI_ATTR void VKAPI_CALL
-radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
- VkPipeline _pipeline)
-{
- VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
- VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
- struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
- const struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
- const struct radeon_cmdbuf *cs = &compute_pipeline->indirect.cs;
- const uint64_t va = compute_pipeline->indirect.va;
- struct radv_compute_pipeline_metadata metadata;
- uint32_t offset = 0;
-
- radv_get_compute_shader_metadata(device, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], &metadata);
-
- radv_write_data(cmd_buffer, V_370_ME, va + offset, sizeof(metadata) / 4, (const uint32_t *)&metadata, false);
- offset += sizeof(metadata);
-
- radv_write_data(cmd_buffer, V_370_ME, va + offset, 1, (const uint32_t *)&cs->cdw, false);
- offset += sizeof(uint32_t);
-
- radv_write_data(cmd_buffer, V_370_ME, va + offset, cs->cdw, (const uint32_t *)cs->buf, false);
- offset += cs->cdw * sizeof(uint32_t);
-
- assert(offset < compute_pipeline->indirect.size);
-}
-
/* VK_EXT_descriptor_buffer */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount,
diff --git a/src/amd/vulkan/radv_cmd_buffer.h b/src/amd/vulkan/radv_cmd_buffer.h
index f5a7c7761ba..f3e2c230730 100644
--- a/src/amd/vulkan/radv_cmd_buffer.h
+++ b/src/amd/vulkan/radv_cmd_buffer.h
@@ -537,7 +537,6 @@ struct radv_cmd_buffer {
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;
- bool has_indirect_pipeline_binds;
uint64_t gfx9_fence_va;
uint32_t gfx9_fence_idx;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 2e6b940425c..02810372a1d 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1100,7 +1100,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
simple_mtx_init(&device->trace_mtx, mtx_plain);
simple_mtx_init(&device->pstate_mtx, mtx_plain);
simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
- simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
simple_mtx_init(&device->pso_cache_stats_mtx, mtx_plain);
device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);
@@ -1359,7 +1358,6 @@ fail_queue:
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
- simple_mtx_destroy(&device->compute_scratch_mtx);
simple_mtx_destroy(&device->pso_cache_stats_mtx);
mtx_destroy(&device->overallocation_mutex);
@@ -1417,7 +1415,6 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
simple_mtx_destroy(&device->pstate_mtx);
simple_mtx_destroy(&device->trace_mtx);
simple_mtx_destroy(&device->rt_handles_mtx);
- simple_mtx_destroy(&device->compute_scratch_mtx);
simple_mtx_destroy(&device->pso_cache_stats_mtx);
radv_destroy_shader_arenas(device);
diff --git a/src/amd/vulkan/radv_device.h b/src/amd/vulkan/radv_device.h
index 39e4907728f..9fd158557d2 100644
--- a/src/amd/vulkan/radv_device.h
+++ b/src/amd/vulkan/radv_device.h
@@ -541,11 +541,6 @@ struct radv_device {
/* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
char *gpu_hang_report;
- /* For indirect compute pipeline binds with DGC only. */
- simple_mtx_t compute_scratch_mtx;
- uint32_t compute_scratch_size_per_wave;
- uint32_t compute_scratch_waves;
-
/* PSO cache stats */
simple_mtx_t pso_cache_stats_mtx;
struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
@@ -559,12 +554,6 @@ radv_device_physical(const struct radv_device *dev)
return (struct radv_physical_device *)dev->vk.physical;
}
-static inline bool
-radv_uses_device_generated_commands(const struct radv_device *device)
-{
- return device->vk.enabled_features.deviceGeneratedCommandsNV || device->vk.enabled_features.deviceGeneratedCompute;
-}
-
static inline bool
radv_uses_primitives_generated_query(const struct radv_device *device)
{
diff --git a/src/amd/vulkan/radv_device_generated_commands.h b/src/amd/vulkan/radv_device_generated_commands.h
deleted file mode 100644
index 0e739db3233..00000000000
--- a/src/amd/vulkan/radv_device_generated_commands.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright © 2016 Red Hat.
- * Copyright © 2016 Bas Nieuwenhuizen
- *
- * based in part on anv driver which is:
- * Copyright © 2015 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef RADV_DEVICE_GENERATED_COMMANDS_H
-#define RADV_DEVICE_GENERATED_COMMANDS_H
-
-#include "vk_object.h"
-
-#include "radv_constants.h"
-
-struct radv_cmd_buffer;
-struct radv_pipeline;
-
-struct radv_indirect_command_layout {
- struct vk_object_base base;
-
- VkIndirectCommandsLayoutUsageFlagsNV flags;
- VkPipelineBindPoint pipeline_bind_point;
-
- uint32_t input_stride;
- uint32_t token_count;
-
- bool indexed;
- bool binds_index_buffer;
- bool draw_mesh_tasks;
- uint16_t draw_params_offset;
- uint16_t index_buffer_offset;
-
- uint16_t dispatch_params_offset;
-
- bool bind_pipeline;
- uint16_t pipeline_params_offset;
-
- bool vertex_dynamic_stride;
- uint32_t bind_vbo_mask;
- uint32_t vbo_offsets[MAX_VBS];
-
- uint64_t push_constant_mask;
- uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4];
- uint32_t push_constant_size;
-
- uint32_t ibo_type_32;
- uint32_t ibo_type_8;
-
- VkPipeline pipeline;
-
- VkIndirectCommandsLayoutTokenNV tokens[0];
-};
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, base, VkIndirectCommandsLayoutNV,
- VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV)
-
-uint32_t radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
-
-uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info);
-
-uint32_t radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
-
-uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info);
-
-uint32_t radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
-
-uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info);
-
-bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
- const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
-
-bool radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline);
-
-bool radv_dgc_with_task_shader(const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
-
-void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
- bool cond_render_enabled);
-
-#endif /* RADV_DEVICE_GENERATED_COMMANDS_H */
diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_dgc.c
similarity index 61%
rename from src/amd/vulkan/radv_device_generated_commands.c
rename to src/amd/vulkan/radv_dgc.c
index 18149ac2f8d..febeb6d657e 100644
--- a/src/amd/vulkan/radv_device_generated_commands.c
+++ b/src/amd/vulkan/radv_dgc.c
@@ -1,23 +1,24 @@
/*
- * Copyright © 2021 Google
+ * Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
-#include "radv_device_generated_commands.h"
+#include "radv_dgc.h"
#include "meta/radv_meta.h"
-#include "radv_cmd_buffer.h"
#include "radv_entrypoints.h"
+#include "radv_pipeline_rt.h"
#include "ac_rgp.h"
#include "nir_builder.h"
#include "vk_common_entrypoints.h"
+#include "vk_device_generated_commands.h"
#include "vk_shader_module.h"
-#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
#define PKT3_INDIRECT_BUFFER_BYTES 16
+#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
/* The DGC command buffer layout is quite complex, here's some explanations:
*
@@ -27,10 +28,9 @@
* | trailer | commands | padding | jump to trailer |
* +---------+----------+---------+-----------------+
*
- * The trailer is used to implement IB chaining for compute queue because IB2
- * isn't supported. The trailer is patched at execute time on the CPU to chain
- * back the DGC command buffer. The trailer is added at the beginning to make
- * sure the offset is fixed (ie. not possible to know the offset with a
+ * The trailer is used to implement IB chaining for the compute queue because IB2 isn't supported.
+ * The trailer is patched at execute time to chain back the DGC command buffer. The trailer is added
+ * at the beginning to make sure its offset is fixed (i.e. it wouldn't be possible to know the offset with a
* preamble). In practice the execution looks like:
*
* +----------+---------+-----------------+ +---------+ +-----------------------+
@@ -53,21 +53,100 @@
*
* The execution of this DGC command buffer is different if it's GFX or COMPUTE queue:
* - on GFX, the driver uses the IB2 packet which the easiest solution
- * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command
- * buffer by patching the trailer
+ * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the
+ * trailer
*/
-static void
-radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout,
- const struct radv_compute_pipeline *pipeline, uint32_t *cmd_size, uint32_t *upload_size)
+
+static uint32_t
+radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
{
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
const struct radv_physical_device *pdev = radv_device_physical(device);
+ const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4;
+
+ return align(size, ib_alignment);
+}
+
+static uint32_t
+radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
+{
+ const struct radv_physical_device *pdev = radv_device_physical(device);
+ const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment;
+
+ return align(size, ib_alignment);
+}
+
+static unsigned
+radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
+{
+ return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
+}
+
+static unsigned
+radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
+{
+ return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
+}
+
+static bool
+radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
+{
+ /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
+ * a bit of a guess as it depends on the actual count which we don't know. */
+ return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64;
+}
+
+struct radv_shader *
+radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info,
+ const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage)
+{
+ if (pipeline_info) {
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
+ return radv_get_shader(pipeline->shaders, stage);
+ } else if (eso_info) {
+ VkShaderStageFlags stages = 0;
+
+ for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
+ VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
+ stages |= mesa_to_vk_shader_stage(shader_object->stage);
+ }
+
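+      /* Pick the shader variant that matches the other bound stages: VS runs as LS when
+       * tessellation is enabled, and VS/TES run as ES when a geometry shader is present.
+       */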
+ for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
+ VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
+
+ if (shader_object->stage != stage)
+ continue;
+
+ if (stage == MESA_SHADER_VERTEX && (stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) {
+ return shader_object->as_ls.shader;
+ } else if ((stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL) &&
+ (stages & VK_SHADER_STAGE_GEOMETRY_BIT)) {
+ return shader_object->as_es.shader;
+ } else {
+ return shader_object->shader;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void
+radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
+ uint32_t *upload_size)
+{
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+ const struct radv_physical_device *pdev = radv_device_physical(device);
+
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
+
+ struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
/* dispatch */
*cmd_size += 5 * 4;
- if (pipeline) {
- struct radv_shader *cs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_COMPUTE);
+ if (cs) {
const struct radv_userdata_info *loc = radv_get_user_sgpr_info(cs, AC_UD_CS_GRID_SIZE);
if (loc->sgpr_idx != -1) {
if (device->load_grid_size_from_user_sgpr) {
@@ -112,60 +191,89 @@ radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout
}
static void
-radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout,
- const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
- uint32_t *ace_cmd_size, uint32_t *upload_size)
+radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout, const void *pNext,
+ uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size)
{
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
const struct radv_physical_device *pdev = radv_device_physical(device);
- const struct radv_shader *vs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_VERTEX);
- if (layout->bind_vbo_mask) {
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
+
+ struct radv_shader *vs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_VERTEX);
+
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
*upload_size += 16 * util_bitcount(vs->info.vs.vb_desc_usage_mask);
/* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
*cmd_size += 3 * 4;
}
- if (layout->indexed) {
- if (layout->binds_index_buffer) {
- /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
- * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
- * overhead)
- */
- *cmd_size += (3 + 3 + 2) * 4;
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
+ /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
+ * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
+ * overhead)
+ */
+ *cmd_size += (3 + 3 + 2) * 4;
+ }
- /* userdata writes + instance count + indexed draw */
- *cmd_size += (5 + 2 + 5) * 4;
- } else {
- /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
- *cmd_size += (4 + (pipeline->uses_drawid ? 10 : 5)) * 4;
- }
- } else {
- if (layout->draw_mesh_tasks) {
- const struct radv_shader *task_shader = radv_get_shader(pipeline->base.shaders, MESA_SHADER_TASK);
+ if (layout->vk.draw_count) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
+ const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
if (task_shader) {
- const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE);
- const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID);
-
/* PKT3_DISPATCH_TASKMESH_GFX */
*cmd_size += 4 * 4;
- if (xyz_loc->sgpr_idx != -1)
- *ace_cmd_size += 5 * 4;
- if (draw_id_loc->sgpr_idx != -1)
- *ace_cmd_size += 3 * 4;
-
- /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */
- *ace_cmd_size += 6 * 4;
+ /* PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
+ *ace_cmd_size += 11 * 4;
} else {
- /* userdata writes + instance count + non-indexed draw */
- *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4;
+ struct radv_shader *ms = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_MESH);
+
+ /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DISPATCH_MESH_INDIRECT_MULTI */
+ *cmd_size += (4 + (ms->info.vs.needs_draw_id ? 3 : 0) + 9) * 4;
}
} else {
- /* userdata writes + instance count + non-indexed draw */
- *cmd_size += (5 + 2 + 3) * 4;
+ /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
+ *cmd_size += (4 + 10) * 4;
+ }
+ } else {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
+ /* userdata writes + instance count + indexed draw */
+ *cmd_size += (5 + 2 + 5) * 4;
+ } else {
+ /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
+ *cmd_size += (4 + (vs->info.vs.needs_draw_id ? 10 : 5)) * 4;
+ }
+ } else {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
+ const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
+
+ if (task_shader) {
+ const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE);
+ const struct radv_userdata_info *draw_id_loc =
+ radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID);
+
+ /* PKT3_DISPATCH_TASKMESH_GFX */
+ *cmd_size += 4 * 4;
+
+ if (xyz_loc->sgpr_idx != -1)
+ *ace_cmd_size += 5 * 4;
+ if (draw_id_loc->sgpr_idx != -1)
+ *ace_cmd_size += 3 * 4;
+
+ /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */
+ *ace_cmd_size += 6 * 4;
+ } else {
+ /* userdata writes + instance count + non-indexed draw */
+ *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4;
+ }
+ } else {
+ /* userdata writes + instance count + non-indexed draw */
+ *cmd_size += (5 + 2 + 3) * 4;
+ }
}
}
@@ -176,24 +284,106 @@ radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layou
}
static void
-radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline,
- uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size)
+radv_get_sequence_size_rt(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
+ uint32_t *upload_size)
{
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
+ const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
+ const struct radv_shader *rt_prolog = rt_pipeline->prolog;
+
+ /* dispatch */
+ *cmd_size += 5 * 4;
+
+ const struct radv_userdata_info *cs_grid_size_loc = radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_GRID_SIZE);
+ if (cs_grid_size_loc->sgpr_idx != -1) {
+ if (device->load_grid_size_from_user_sgpr) {
+ /* PKT3_LOAD_SH_REG_INDEX */
+ *cmd_size += 5 * 4;
+ } else {
+ /* PKT3_SET_SH_REG for pointer */
+ *cmd_size += 4 * 4;
+ }
+ }
+
+ const struct radv_userdata_info *cs_sbt_descriptors_loc =
+ radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
+ if (cs_sbt_descriptors_loc->sgpr_idx != -1) {
+ /* PKT3_SET_SH_REG for pointer */
+ *cmd_size += 4 * 4;
+ }
+
+ const struct radv_userdata_info *cs_ray_launch_size_addr_loc =
+ radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
+ if (cs_ray_launch_size_addr_loc->sgpr_idx != -1) {
+ /* PKT3_SET_SH_REG for pointer */
+ *cmd_size += 4 * 4;
+ }
+
+ if (device->sqtt.bo) {
+ /* sqtt markers */
+ *cmd_size += 5 * 3 * 4;
+ }
+}
+
+static void
+radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
+ uint32_t *ace_cmd_size, uint32_t *upload_size)
+{
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
*cmd_size = 0;
*ace_cmd_size = 0;
*upload_size = 0;
- if (layout->push_constant_mask) {
+ if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
+ VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
bool need_copy = false;
- if (pipeline) {
- for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
- if (!pipeline->shaders[i])
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
+ /* Assume the compute shader needs both user SGPRs because we can't know the information
+ * for indirect pipelines.
+ */
+ *cmd_size += 3 * 4;
+ need_copy = true;
+
+ *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
+ } else {
+ struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
+ if (pipeline_info) {
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
+
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
+ const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
+ struct radv_shader *rt_prolog = rt_pipeline->prolog;
+
+ shaders[MESA_SHADER_COMPUTE] = rt_prolog;
+ } else {
+ memcpy(shaders, pipeline->shaders, sizeof(shaders));
+ }
+ } else if (eso_info) {
+ for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
+ VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
+ struct radv_shader *shader = shader_object->shader;
+ gl_shader_stage stage = shader->info.stage;
+
+ shaders[stage] = shader;
+ }
+ }
+
+ for (unsigned i = 0; i < ARRAY_SIZE(shaders); ++i) {
+ const struct radv_shader *shader = shaders[i];
+
+ if (!shader)
continue;
- struct radv_userdata_locations *locs = &pipeline->shaders[i]->info.user_sgprs_locs;
+ const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
/* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
if (i == MESA_SHADER_TASK) {
@@ -214,19 +404,10 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct
}
}
}
- } else {
- /* Assume the compute shader needs both user SGPRs because we can't know the information
- * for indirect pipelines.
- */
- assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
- *cmd_size += 3 * 4;
- need_copy = true;
-
- *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
}
if (need_copy) {
- *upload_size += align(layout->push_constant_size, 16);
+ *upload_size += align(pipeline_layout->push_constant_size, 16);
}
}
@@ -235,54 +416,15 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct
*cmd_size += 2 * 4;
}
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- radv_get_sequence_size_graphics(layout, graphics_pipeline, cmd_size, ace_cmd_size, upload_size);
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
+ radv_get_sequence_size_compute(layout, pNext, cmd_size, upload_size);
+ } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
+ radv_get_sequence_size_rt(layout, pNext, cmd_size, upload_size);
} else {
- assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
- struct radv_compute_pipeline *compute_pipeline = pipeline ? radv_pipeline_to_compute(pipeline) : NULL;
- radv_get_sequence_size_compute(layout, compute_pipeline, cmd_size, upload_size);
+ radv_get_sequence_size_graphics(layout, pNext, cmd_size, ace_cmd_size, upload_size);
}
}
-static uint32_t
-radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
-{
- const struct radv_physical_device *pdev = radv_device_physical(device);
- const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4;
-
- return align(size, ib_alignment);
-}
-
-static uint32_t
-radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
-{
- const struct radv_physical_device *pdev = radv_device_physical(device);
- const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment;
-
- return align(size, ib_alignment);
-}
-
-static unsigned
-radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
-{
- return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
-}
-
-static unsigned
-radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
-{
- return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
-}
-
-static bool
-radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info)
-{
- /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
- * a bit of a guess as it depends on the actual count which we don't know. */
- return cmd_info->sequencesCountBuffer != VK_NULL_HANDLE && cmd_info->sequencesCount >= 64;
-}
-
struct dgc_cmdbuf_layout {
bool use_preamble;
uint32_t alloc_size;
@@ -308,15 +450,13 @@ struct dgc_cmdbuf_layout {
static void
get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indirect_command_layout *dgc_layout,
- struct radv_pipeline *pipeline, uint32_t sequences_count, bool use_preamble,
- struct dgc_cmdbuf_layout *layout)
+ const void *pNext, uint32_t sequences_count, bool use_preamble, struct dgc_cmdbuf_layout *layout)
{
uint32_t offset = 0;
memset(layout, 0, sizeof(*layout));
- radv_get_sequence_size(dgc_layout, pipeline, &layout->main_cmd_stride, &layout->ace_cmd_stride,
- &layout->upload_stride);
+ radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride, &layout->upload_stride);
layout->use_preamble = use_preamble;
if (layout->use_preamble) {
@@ -352,6 +492,7 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
+
layout->ace_preamble_offset = offset;
if (layout->use_preamble)
@@ -370,16 +511,15 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
}
static uint32_t
-radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
+radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
{
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
- const bool use_preamble = radv_dgc_use_preamble(cmd_info);
- const uint32_t sequences_count = cmd_info->sequencesCount;
+ VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+ const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
+ const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
struct dgc_cmdbuf_layout cmdbuf_layout;
- get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
+ get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
if (use_preamble)
return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_size : cmdbuf_layout.ace_preamble_size;
@@ -388,31 +528,29 @@ radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info, enum am
}
static uint32_t
-radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
+radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
{
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
- const bool use_preamble = radv_dgc_use_preamble(cmd_info);
- const uint32_t sequences_count = cmd_info->sequencesCount;
+ VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+ const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
+ const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
struct dgc_cmdbuf_layout cmdbuf_layout;
- get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
+ get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset;
}
static uint32_t
-radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum amd_ip_type ip_type)
+radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
{
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
- const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
- const bool use_preamble = radv_dgc_use_preamble(cmd_info);
- const uint32_t sequences_count = cmd_info->sequencesCount;
+ VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
+ const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
+ const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
+ const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
struct dgc_cmdbuf_layout cmdbuf_layout;
- get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
+ get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
const uint32_t offset = ip_type == AMD_IP_GFX ? cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset;
@@ -420,39 +558,39 @@ radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info, enum
}
uint32_t
-radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_GFX);
+ return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_GFX);
}
uint32_t
-radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_GFX);
+ return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
}
uint32_t
-radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_GFX);
+ return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
}
uint32_t
-radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_cmdbuf_offset(cmd_info, AMD_IP_COMPUTE);
+ return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
}
uint32_t
-radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_cmdbuf_size(cmd_info, AMD_IP_COMPUTE);
+ return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
}
uint32_t
-radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoNV *cmd_info)
+radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- return radv_get_indirect_trailer_offset(cmd_info, AMD_IP_COMPUTE);
+ return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
}
struct radv_dgc_params {
@@ -472,9 +610,13 @@ struct radv_dgc_params {
uint64_t sequence_count_addr;
uint64_t stream_addr;
+ uint8_t queue_family;
+ uint8_t use_preamble;
+
/* draw info */
uint16_t vtx_base_sgpr;
uint32_t max_index_count;
+ uint32_t max_draw_count;
/* task/mesh info */
uint8_t has_task_shader;
@@ -483,26 +625,34 @@ struct radv_dgc_params {
uint16_t task_ring_entry_sgpr;
uint16_t task_xyz_sgpr;
uint16_t task_draw_id_sgpr;
- uint8_t wave32;
- uint8_t const_copy;
+ /* dispatch info */
+ uint16_t grid_base_sgpr;
+ uint32_t wave32;
- uint16_t vbo_reg;
+ /* RT info */
+ uint16_t cs_sbt_descriptors;
+ uint16_t cs_ray_launch_size_addr;
+
+ /* VBO info */
uint32_t vb_desc_usage_mask;
+ uint16_t vbo_reg;
uint8_t dynamic_vs_input;
uint8_t use_per_attribute_vb_descs;
+ /* push constants info */
+ uint8_t const_copy;
uint16_t push_constant_stages;
- uint8_t use_preamble;
+ /* IES info */
+ uint64_t ies_addr;
+ uint32_t ies_stride;
+ uint32_t indirect_desc_sets_va;
/* For conditional rendering on ACE. */
uint8_t predicating;
uint8_t predication_type;
uint64_t predication_va;
-
- /* For indirect descriptor sets */
- uint32_t indirect_desc_sets_va;
};
enum {
@@ -520,7 +670,7 @@ struct dgc_cmdbuf {
nir_variable *offset;
nir_variable *upload_offset;
- nir_def *pipeline_va; /* For compute pipelines */
+ nir_def *ies_va;
};
static void
@@ -566,15 +716,19 @@ dgc_upload(struct dgc_cmdbuf *cs, nir_def *data)
nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \
.base = offsetof(struct radv_dgc_params, field), .range = 8))
-/* Pipeline metadata */
static nir_def *
-dgc_get_pipeline_va(struct dgc_cmdbuf *cs, nir_def *stream_addr)
+dgc_load_ies_va(struct dgc_cmdbuf *cs, nir_def *stream_addr)
{
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- return nir_build_load_global(b, 1, 64, nir_iadd_imm(b, stream_addr, layout->pipeline_params_offset),
- .access = ACCESS_NON_WRITEABLE);
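+   /* Read the execution set index from the DGC stream and turn it into the VA of the
+    * corresponding entry in the indirect execution set.
+    */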
+ nir_def *offset = nir_imm_int(b, layout->vk.ies_src_offset_B);
+ nir_def *ies_index =
+ nir_build_load_global(b, 1, 32, nir_iadd(b, stream_addr, nir_u2u64(b, offset)), .access = ACCESS_NON_WRITEABLE);
+ nir_def *ies_stride = load_param32(b, ies_stride);
+ nir_def *ies_offset = nir_imul(b, ies_index, ies_stride);
+
+ return nir_iadd(b, load_param64(b, ies_addr), nir_u2u64(b, ies_offset));
}
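/* A sketch of the address computation above: ies_addr and ies_stride come from
 * the DGC push constants and ies_index is read from the token stream at
 * vk.ies_src_offset_B, i.e.
 *
 *    uint64_t ies_va = ies_addr + (uint64_t)ies_index * ies_stride;
 */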
static nir_def *
@@ -583,8 +737,8 @@ dgc_load_shader_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, uint32_t field
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- if (layout->bind_pipeline) {
- return nir_load_global(b, nir_iadd_imm(b, cs->pipeline_va, field_offset), 4, 1, bitsize);
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
+ return nir_load_global(b, nir_iadd_imm(b, cs->ies_va, field_offset), 4, 1, bitsize);
} else {
nir_def *params_buf = radv_meta_load_descriptor(b, 0, 0);
@@ -646,61 +800,9 @@ nir_pkt3(nir_builder *b, unsigned op, nir_def *len)
return nir_pkt3_base(b, op, len, false);
}
-static void
-dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid)
-{
- nir_builder *b = cs->b;
-
- nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
- vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
-
- nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
- nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
-
- nir_def *pkt_cnt = nir_imm_int(b, 1);
- pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
- pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
-
- dgc_cs_begin(cs);
- dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
- dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
- dgc_cs_emit(first_vertex);
- dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
- nir_imm_int(b, PKT3_NOP_PAD)));
- dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD)));
- dgc_cs_end();
-}
-
-static void
-dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid)
-{
- nir_builder *b = cs->b;
-
- nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
- vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
-
- nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
- nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
-
- nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
- {
- nir_def *pkt_cnt = nir_imm_int(b, 0);
- pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
- pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
-
- dgc_cs_begin(cs);
- dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
- dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
- /* DrawID needs to be first if no GridSize. */
- dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid));
- dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD)));
- dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD)));
- dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD)));
- dgc_cs_end();
- }
- nir_pop_if(b, NULL);
-}
-
+/**
+ * SQTT
+ */
static void
dgc_emit_sqtt_userdata(struct dgc_cmdbuf *cs, nir_def *data)
{
@@ -788,184 +890,9 @@ dgc_emit_sqtt_end_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general
dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
}
-static void
-dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0));
- dgc_cs_emit(instance_count);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
- nir_def *max_index_count)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0));
- dgc_cs_emit(max_index_count);
- dgc_cs_emit(index_offset);
- dgc_cs_emit(index_count);
- dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
- dgc_cs_emit(vertex_count);
- dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
- nir_def *dispatch_initiator)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
- dgc_cs_emit(wg_x);
- dgc_cs_emit(wg_y);
- dgc_cs_emit(wg_z);
- dgc_cs_emit(dispatch_initiator);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0));
- dgc_cs_emit(x);
- dgc_cs_emit(y);
- dgc_cs_emit(z);
- dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
- dgc_cs_end();
-}
-
-static void
-dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y,
- nir_def *wg_z)
-{
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
- dgc_cs_emit(grid_base_sgpr);
- dgc_cs_emit(wg_x);
- dgc_cs_emit(wg_y);
- dgc_cs_emit(wg_z);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *stream_addr)
-{
- const struct radv_indirect_command_layout *layout = cs->layout;
- nir_builder *b = cs->b;
-
- nir_def *va = nir_iadd_imm(b, stream_addr, layout->dispatch_params_offset);
-
- nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
- nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
-
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
- dgc_cs_emit(grid_base_sgpr);
- dgc_cs_emit(va_lo);
- dgc_cs_emit(va_hi);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va)
-{
- nir_builder *b = cs->b;
-
- nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
- nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
-
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0));
- dgc_cs_emit_imm(1);
- dgc_cs_emit(va_lo);
- dgc_cs_emit(va_hi);
- dgc_cs_end();
-}
-
-static void
-dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed)
-{
- const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
- nir_builder *b = cs->b;
-
- nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
-
- nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
- nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
-
- vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
-
- /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
- nir_def *vertex_offset_reg = vtx_base_sgpr;
-
- /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
- nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
- nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
-
- /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
- nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
-
- nir_if *if_drawid = nir_push_if(b, has_drawid);
- {
- const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
-
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0));
- dgc_cs_emit_imm(0);
- dgc_cs_emit(vertex_offset_reg);
- dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
- dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1))));
- dgc_cs_emit_imm(1); /* draw count */
- dgc_cs_emit_imm(0); /* count va low */
- dgc_cs_emit_imm(0); /* count va high */
- dgc_cs_emit_imm(0); /* stride */
- dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
- dgc_cs_end();
- }
- nir_push_else(b, if_drawid);
- {
- const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
-
- dgc_cs_begin(cs);
- dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0));
- dgc_cs_emit_imm(0);
- dgc_cs_emit(vertex_offset_reg);
- dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
- dgc_cs_emit_imm(di_src_sel);
- dgc_cs_end();
- }
- nir_pop_if(b, if_drawid);
-}
-
-static void
-dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
-{
- const struct radv_indirect_command_layout *layout = cs->layout;
- nir_builder *b = cs->b;
-
- nir_def *va = nir_iadd_imm(b, stream_addr, layout->draw_params_offset);
-
- dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
- dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect);
-
- dgc_emit_pkt3_set_base(cs, va);
- dgc_emit_pkt3_draw_indirect(cs, indexed);
-
- dgc_emit_sqtt_thread_trace_marker(cs);
- dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
-}
-
+/**
+ * Command buffer
+ */
static nir_def *
dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const struct radv_device *device)
{
@@ -987,10 +914,11 @@ dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const str
static void
build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride,
- nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size,
+ nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size, bool is_ace,
const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
+ nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace);
nir_def *global_id = get_global_ids(b, 1);
@@ -1001,8 +929,11 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
- /* Add NOPs padding but leave space for the INDIRECT_BUFFER packet. */
- cmd_buf_size = nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES);
+   /* On the compute queue, the DGC command buffer is chained by patching the
+    * trailer; this isn't needed on graphics queues because the generated IB
+    * is executed through IB2.
+    */
+ cmd_buf_size =
+ nir_bcsel(b, is_compute_queue, nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_size);
nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
nir_push_loop(b);
@@ -1028,16 +959,20 @@ build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_
}
nir_pop_loop(b, NULL);
- nir_def *chain_packet[] = {
- nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
- nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
- nir_imm_int(b, pdev->info.address32_hi),
- nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
- };
+ nir_push_if(b, is_compute_queue);
+ {
+ nir_def *chain_packets[] = {
+ nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
+ nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
+ nir_imm_int(b, pdev->info.address32_hi),
+ nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
+ };
- nir_build_store_global(b, nir_vec(b, chain_packet, 4),
- nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
- .access = ACCESS_NON_READABLE);
+ nir_build_store_global(b, nir_vec(b, chain_packets, 4),
+ nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
+ .access = ACCESS_NON_READABLE);
+ }
+ nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
}
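/* For reference, the chain packet stored above spans four dwords (the space
 * reserved by PKT3_INDIRECT_BUFFER_BYTES): the PKT3 header, the low/high
 * halves of the trailer address, and size | CHAIN | VALID. Graphics queues
 * skip it because the generated commands are executed through IB2.
 */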
@@ -1052,7 +987,7 @@ build_dgc_buffer_tail_main(nir_builder *b, nir_def *sequence_count, const struct
unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4;
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
- trailer_size, device);
+ trailer_size, false, device);
}
static void
@@ -1065,7 +1000,7 @@ build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct
unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4;
build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
- trailer_size, device);
+ trailer_size, true, device);
}
static void
@@ -1183,15 +1118,161 @@ build_dgc_buffer_preamble_ace(nir_builder *b, nir_def *sequence_count, const str
}
/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV.
+ * Draw
*/
+static void
+dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid)
+{
+ nir_builder *b = cs->b;
+
+ nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
+ vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
+
+ nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
+ nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
+
+ nir_def *pkt_cnt = nir_imm_int(b, 1);
+ pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
+ pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
+ dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
+ dgc_cs_emit(first_vertex);
+ dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
+ nir_imm_int(b, PKT3_NOP_PAD)));
+ dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD)));
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count)
+{
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0));
+ dgc_cs_emit(instance_count);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
+ nir_def *max_index_count)
+{
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0));
+ dgc_cs_emit(max_index_count);
+ dgc_cs_emit(index_offset);
+ dgc_cs_emit(index_count);
+ dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count)
+{
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
+ dgc_cs_emit(vertex_count);
+ dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va)
+{
+ nir_builder *b = cs->b;
+
+ nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
+ nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0));
+ dgc_cs_emit_imm(1);
+ dgc_cs_emit(va_lo);
+ dgc_cs_emit(va_hi);
+ dgc_cs_end();
+}
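/* Base index 1 in SET_BASE selects the indirect-data base address, which the
 * DRAW_*_INDIRECT packets emitted afterwards use as the origin for their data
 * offset (0 here, since the VA already points at the draw parameters).
 */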
+
+static void
+dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed)
+{
+ const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+ nir_builder *b = cs->b;
+
+ nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
+
+ nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
+ nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
+
+ vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
+
+ /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
+ nir_def *vertex_offset_reg = vtx_base_sgpr;
+
+ /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
+ nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
+ nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
+
+ /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
+ nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
+
+ nir_if *if_drawid = nir_push_if(b, has_drawid);
+ {
+ const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0));
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(vertex_offset_reg);
+ dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
+ dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1))));
+ dgc_cs_emit_imm(1); /* draw count */
+ dgc_cs_emit_imm(0); /* count va low */
+ dgc_cs_emit_imm(0); /* count va high */
+ dgc_cs_emit_imm(0); /* stride */
+ dgc_cs_emit_imm(di_src_sel);
+ dgc_cs_end();
+ }
+ nir_push_else(b, if_drawid);
+ {
+ const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0));
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(vertex_offset_reg);
+ dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
+ dgc_cs_emit_imm(di_src_sel);
+ dgc_cs_end();
+ }
+ nir_pop_if(b, if_drawid);
+}
+
+static void
+dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
+{
+ const struct radv_indirect_command_layout *layout = cs->layout;
+ nir_builder *b = cs->b;
+
+ nir_def *va = nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B);
+
+ dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
+ dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect);
+
+ dgc_emit_pkt3_set_base(cs, va);
+ dgc_emit_pkt3_draw_indirect(cs, indexed);
+
+ dgc_emit_sqtt_thread_trace_marker(cs);
+ dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
+}
+
static void
dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
{
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset),
+ nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
.access = ACCESS_NON_WRITEABLE);
nir_def *vertex_count = nir_channel(b, draw_data0, 0);
nir_def *instance_count = nir_channel(b, draw_data0, 1);
@@ -1203,7 +1284,7 @@ dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDraw);
dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDraw);
- dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, sequence_id);
+ dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
dgc_emit_instance_count(cs, instance_count);
dgc_emit_draw_index_auto(cs, vertex_count);
@@ -1213,19 +1294,16 @@ dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
nir_pop_if(b, 0);
}
-/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV.
- */
static void
dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, nir_def *max_index_count)
{
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset),
+ nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
.access = ACCESS_NON_WRITEABLE);
nir_def *draw_data1 =
- nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->draw_params_offset), 16),
+ nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), 16),
.access = ACCESS_NON_WRITEABLE);
nir_def *index_count = nir_channel(b, draw_data0, 0);
nir_def *instance_count = nir_channel(b, draw_data0, 1);
@@ -1238,7 +1316,7 @@ dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequ
dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawIndexed);
dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawIndexed);
- dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, sequence_id);
+ dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
dgc_emit_instance_count(cs, instance_count);
dgc_emit_draw_index_offset_2(cs, first_index, index_count, max_index_count);
@@ -1248,8 +1326,56 @@ dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequ
nir_pop_if(b, 0);
}
+static void
+dgc_emit_draw_with_count(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
+{
+ const struct radv_indirect_command_layout *layout = cs->layout;
+ nir_builder *b = cs->b;
+
+ nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
+ nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
+ nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
+
+ nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
+ .access = ACCESS_NON_WRITEABLE);
+ nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
+ nir_def *stride = nir_channel(b, draw_data, 2);
+ nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
+
+ dgc_emit_pkt3_set_base(cs, va);
+
+ nir_def *vertex_offset_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
+ nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
+ nir_def *start_instance_reg =
+ nir_bcsel(b, has_baseinstance, nir_iadd(b, vertex_offset_reg, start_instance_offset), nir_imm_int(b, 0));
+ nir_def *draw_id_reg = nir_bcsel(
+ b, has_drawid, nir_ior_imm(b, nir_iadd(b, vertex_offset_reg, nir_imm_int(b, 1)), S_2C3_DRAW_INDEX_ENABLE(1)),
+ nir_imm_int(b, 0));
+
+ nir_def *di_src_sel = nir_imm_int(b, indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+
+ dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
+ dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirectCount : EventCmdDrawIndirectCount);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, false));
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(vertex_offset_reg);
+ dgc_cs_emit(start_instance_reg);
+ dgc_cs_emit(draw_id_reg);
+ dgc_cs_emit(draw_count);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(stride);
+ dgc_cs_emit(di_src_sel);
+ dgc_cs_end();
+
+ dgc_emit_sqtt_thread_trace_marker(cs);
+ dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
+}
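/* The draw token consumed above is laid out like VkDrawIndirectCountIndirectCommandEXT
 * (a 64-bit indirect-buffer address, the per-draw stride, then the command count);
 * roughly, on the host side:
 *
 *    struct { uint64_t va; uint32_t stride; uint32_t count; } token;
 *    uint32_t draw_count = MIN2(token.count, max_draw_count);
 *
 * with max_draw_count provided through the DGC push constants.
 */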
+
/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV.
+ * Index buffer
*/
static nir_def *
dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type)
@@ -1257,10 +1383,17 @@ dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type)
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, layout->ibo_type_32),
- nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
- return nir_bcsel(b, nir_ieq_imm(b, user_index_type, layout->ibo_type_8), nir_imm_int(b, V_028A7C_VGT_INDEX_8),
- index_type);
+ if (layout->vk.index_mode_is_dx) {
+ nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x2a /* DXGI_FORMAT_R32_UINT */),
+ nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
+ return nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x3e /* DXGI_FORMAT_R8_UINT */),
+ nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
+ } else {
+ nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT32),
+ nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
+ return nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT8_KHR),
+ nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
+ }
}
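/* Summarized, the mapping implemented above (16-bit is the fallback when
 * neither value matches):
 *
 *    DX mode:      0x2a (DXGI_FORMAT_R32_UINT) -> V_028A7C_VGT_INDEX_32
 *                  0x3e (DXGI_FORMAT_R8_UINT)  -> V_028A7C_VGT_INDEX_8
 *                  anything else               -> V_028A7C_VGT_INDEX_16
 *    Vulkan mode:  VK_INDEX_TYPE_UINT32        -> V_028A7C_VGT_INDEX_32
 *                  VK_INDEX_TYPE_UINT8_KHR     -> V_028A7C_VGT_INDEX_8
 *                  anything else               -> V_028A7C_VGT_INDEX_16
 */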
static void
@@ -1271,7 +1404,7 @@ dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_builder *b = cs->b;
- nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->index_buffer_offset),
+ nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.index_src_offset_B),
.access = ACCESS_NON_WRITEABLE);
nir_def *index_type = dgc_get_index_type(cs, nir_channel(b, data, 3));
@@ -1309,15 +1442,15 @@ dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable
}
/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV.
+ * Push constants
*/
static nir_def *
-dgc_get_push_constant_stages(struct dgc_cmdbuf *cs, nir_def *stream_addr)
+dgc_get_push_constant_stages(struct dgc_cmdbuf *cs)
{
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
nir_def *has_push_constant = nir_ine_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0);
return nir_bcsel(b, has_push_constant, nir_imm_int(b, VK_SHADER_STAGE_COMPUTE_BIT), nir_imm_int(b, 0));
} else {
@@ -1332,7 +1465,7 @@ dgc_get_upload_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of
nir_builder *b = cs->b;
nir_def *res;
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
res = load_shader_metadata32(cs, push_const_sgpr);
} else {
res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
@@ -1348,7 +1481,7 @@ dgc_get_inline_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of
nir_builder *b = cs->b;
nir_def *res;
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
res = load_shader_metadata32(cs, push_const_sgpr);
} else {
res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
@@ -1363,7 +1496,7 @@ dgc_get_inline_mask(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_of
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
return load_shader_metadata64(cs, inline_push_const_mask);
} else {
nir_def *reg_info = nir_load_ssbo(b, 2, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12 + 4));
@@ -1377,7 +1510,7 @@ dgc_push_constant_needs_copy(struct dgc_cmdbuf *cs)
const struct radv_indirect_command_layout *layout = cs->layout;
nir_builder *b = cs->b;
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
return nir_ine_imm(b, nir_ubfe_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0, 16), 0);
} else {
return nir_ine_imm(b, load_param8(b, const_copy), 0);
@@ -1400,13 +1533,12 @@ dgc_get_pc_params(struct dgc_cmdbuf *cs)
params.buf = radv_meta_load_descriptor(b, 0, 0);
uint32_t offset = 0;
-
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
- offset = layout->bind_pipeline ? 0 : sizeof(struct radv_compute_pipeline_metadata);
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
+ offset =
+ (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) ? 0 : sizeof(struct radv_compute_pipeline_metadata);
} else {
- if (layout->bind_vbo_mask) {
- offset += MAX_VBS * DGC_VBO_INFO_SIZE;
- }
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB))
+ offset = MAX_VBS * DGC_VBO_INFO_SIZE;
}
params.offset = nir_imm_int(b, offset);
@@ -1416,15 +1548,19 @@ dgc_get_pc_params(struct dgc_cmdbuf *cs)
}
static void
-dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struct dgc_pc_params *params)
+dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
+ const struct dgc_pc_params *params)
{
const struct radv_indirect_command_layout *layout = cs->layout;
+ VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
nir_builder *b = cs->b;
- for (uint32_t i = 0; i < layout->push_constant_size / 4; i++) {
+ for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
nir_def *data;
- if ((layout->push_constant_mask & (1ull << i))) {
+ if (layout->sequence_index_mask & (1ull << i)) {
+ data = sequence_id;
+ } else if ((layout->push_constant_mask & (1ull << i))) {
data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
.access = ACCESS_NON_WRITEABLE);
} else {
@@ -1436,10 +1572,11 @@ dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struc
}
static void
-dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, const struct dgc_pc_params *params,
- gl_shader_stage stage)
+dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
+ const struct dgc_pc_params *params, gl_shader_stage stage)
{
const struct radv_indirect_command_layout *layout = cs->layout;
+ VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
nir_builder *b = cs->b;
nir_def *upload_sgpr = dgc_get_upload_sgpr(cs, params->buf, params->offset, stage);
@@ -1461,15 +1598,17 @@ dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, co
nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx");
nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1);
- for (uint32_t i = 0; i < layout->push_constant_size / 4; i++) {
+ for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
nir_push_if(b, nir_ine_imm(b, nir_iand_imm(b, inline_mask, 1ull << i), 0));
{
nir_def *data = NULL;
- if (layout->push_constant_mask & (1ull << i)) {
+ if (layout->sequence_index_mask & (1ull << i)) {
+ data = sequence_id;
+ } else if (layout->push_constant_mask & (1ull << i)) {
data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
.access = ACCESS_NON_WRITEABLE);
- } else if (layout->bind_pipeline) {
+ } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
/* For indirect pipeline binds, partial push constant updates can't be emitted when
* the DGC execute is called because there is no bound pipeline and they have to be
* emitted from the DGC prepare shader.
@@ -1494,17 +1633,17 @@ dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, co
}
static void
-dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, VkShaderStageFlags stages)
+dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, VkShaderStageFlags stages)
{
const struct dgc_pc_params params = dgc_get_pc_params(cs);
nir_builder *b = cs->b;
- nir_def *push_constant_stages = dgc_get_push_constant_stages(cs, stream_addr);
+ nir_def *push_constant_stages = dgc_get_push_constant_stages(cs);
radv_foreach_stage(s, stages)
{
nir_push_if(b, nir_test_mask(b, push_constant_stages, mesa_to_vk_shader_stage(s)));
{
-         dgc_emit_push_constant_for_stage(cs, stream_addr, &params, s);
+         dgc_emit_push_constant_for_stage(cs, stream_addr, sequence_id, &params, s);
}
nir_pop_if(b, NULL);
}
@@ -1512,13 +1651,13 @@ dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, VkShaderStag
nir_def *const_copy = dgc_push_constant_needs_copy(cs);
nir_push_if(b, const_copy);
{
-      dgc_alloc_push_constant(cs, stream_addr, &params);
+      dgc_alloc_push_constant(cs, stream_addr, sequence_id, &params);
}
nir_pop_if(b, NULL);
}
/**
- * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV.
+ * Vertex buffers
*/
struct dgc_vbo_info {
nir_def *va;
@@ -1689,8 +1828,8 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr)
nir_def *binding = load_vbo_metadata32(cs, cur_idx, binding);
- nir_def *vbo_override =
- nir_ine_imm(b, nir_iand(b, nir_imm_int(b, layout->bind_vbo_mask), nir_ishl(b, nir_imm_int(b, 1), binding)), 0);
+ nir_def *vbo_override = nir_ine_imm(
+ b, nir_iand(b, nir_imm_int(b, layout->vk.vertex_bindings), nir_ishl(b, nir_imm_int(b, 1), binding)), 0);
nir_push_if(b, vbo_override);
{
nir_def *stream_offset = load_vbo_offset(cs, cur_idx);
@@ -1700,12 +1839,7 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr)
nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2));
nir_def *size = nir_channel(b, stream_data, 2);
- nir_def *stride;
- if (layout->vertex_dynamic_stride) {
- stride = nir_channel(b, stream_data, 3);
- } else {
- stride = load_vbo_metadata32(cs, cur_idx, stride);
- }
+ nir_def *stride = nir_channel(b, stream_data, 3);
nir_store_var(b, va_var, va, 0x1);
nir_store_var(b, size_var, size, 0x1);
@@ -1750,7 +1884,7 @@ dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr)
}
/**
- * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV.
+ * Compute dispatch
*/
static nir_def *
dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs)
@@ -1765,36 +1899,66 @@ dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs)
}
static void
-dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
+dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y,
+ nir_def *wg_z)
+{
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
+ dgc_cs_emit(grid_base_sgpr);
+ dgc_cs_emit(wg_x);
+ dgc_cs_emit(wg_y);
+ dgc_cs_emit(wg_z);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *size_va)
+{
+ nir_builder *b = cs->b;
+
+ nir_def *va_lo = nir_unpack_64_2x32_split_x(b, size_va);
+ nir_def *va_hi = nir_unpack_64_2x32_split_y(b, size_va);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
+ dgc_cs_emit(grid_base_sgpr);
+ dgc_cs_emit(va_lo);
+ dgc_cs_emit(va_hi);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
+ nir_def *dispatch_initiator, nir_def *grid_sgpr, nir_def *size_va, nir_def *sequence_id,
+ bool is_rt)
{
- const struct radv_indirect_command_layout *layout = cs->layout;
const struct radv_device *device = cs->dev;
nir_builder *b = cs->b;
- nir_def *dispatch_data = nir_build_load_global(
- b, 3, 32, nir_iadd_imm(b, stream_addr, layout->dispatch_params_offset), .access = ACCESS_NON_WRITEABLE);
- nir_def *wg_x = nir_channel(b, dispatch_data, 0);
- nir_def *wg_y = nir_channel(b, dispatch_data, 1);
- nir_def *wg_z = nir_channel(b, dispatch_data, 2);
-
nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0))));
{
- nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr);
nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0));
{
if (device->load_grid_size_from_user_sgpr) {
dgc_emit_grid_size_user_sgpr(cs, grid_sgpr, wg_x, wg_y, wg_z);
} else {
- dgc_emit_grid_size_pointer(cs, grid_sgpr, stream_addr);
+ dgc_emit_grid_size_pointer(cs, grid_sgpr, size_va);
}
}
nir_pop_if(b, 0);
dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDispatch);
- dgc_emit_sqtt_marker_event_with_dims(cs, sequence_id, wg_x, wg_y, wg_z, EventCmdDispatch);
+ dgc_emit_sqtt_marker_event_with_dims(
+ cs, sequence_id, wg_x, wg_y, wg_z,
+ is_rt ? EventCmdTraceRaysKHR | ApiRayTracingSeparateCompiled : EventCmdDispatch);
- nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs);
- dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator);
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
+ dgc_cs_emit(wg_x);
+ dgc_cs_emit(wg_y);
+ dgc_cs_emit(wg_z);
+ dgc_cs_emit(dispatch_initiator);
+ dgc_cs_end();
dgc_emit_sqtt_thread_trace_marker(cs);
dgc_emit_sqtt_end_api_marker(cs, ApiCmdDispatch);
@@ -1802,11 +1966,72 @@ dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence
nir_pop_if(b, 0);
}
+static void
+dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
+{
+ const struct radv_indirect_command_layout *layout = cs->layout;
+ nir_builder *b = cs->b;
+
+ nir_def *dispatch_data = nir_build_load_global(
+ b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B), .access = ACCESS_NON_WRITEABLE);
+ nir_def *wg_x = nir_channel(b, dispatch_data, 0);
+ nir_def *wg_y = nir_channel(b, dispatch_data, 1);
+ nir_def *wg_z = nir_channel(b, dispatch_data, 2);
+
+ nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr);
+ nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs);
+ nir_def *size_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
+
+ dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator, grid_sgpr, size_va, sequence_id, false);
+}
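/* The dispatch token read above is laid out like VkDispatchIndirectCommand
 * (three dwords: x, y, z); size_va points at those same dwords so that, when
 * the grid size is not passed through user SGPRs, the shader is handed a
 * pointer to the workgroup counts rather than the values themselves.
 */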
+
/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV.
+ * Draw mesh/task
*/
static void
-dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs)
+dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid)
+{
+ nir_builder *b = cs->b;
+
+ nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
+ vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
+
+ nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
+ nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
+
+ nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
+ {
+ nir_def *pkt_cnt = nir_imm_int(b, 0);
+ pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
+ pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
+ dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
+ /* DrawID needs to be first if no GridSize. */
+ dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid));
+ dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD)));
+ dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD)));
+ dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD)));
+ dgc_cs_end();
+ }
+ nir_pop_if(b, NULL);
+}
+
+static void
+dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
+{
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0));
+ dgc_cs_emit(x);
+ dgc_cs_emit(y);
+ dgc_cs_emit(z);
+ dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs, nir_def *sequence_id)
{
const struct radv_device *device = cs->dev;
const struct radv_physical_device *pdev = radv_device_physical(device);
@@ -1826,6 +2051,9 @@ dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs)
nir_bcsel(b, has_linear_dispatch_en, nir_imm_int(b, S_4D1_LINEAR_DISPATCH_ENABLE(1)), nir_imm_int(b, 0));
nir_def *sqtt_enable = nir_imm_int(b, device->sqtt.bo ? S_4D1_THREAD_TRACE_MARKER_ENABLE(1) : 0);
+ dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
+ dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
+
dgc_cs_begin(cs);
dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, 0) | PKT3_RESET_FILTER_CAM_S(1));
/* S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg) */
@@ -1837,6 +2065,9 @@ dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs)
}
dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
dgc_cs_end();
+
+ dgc_emit_sqtt_thread_trace_marker(cs);
+ dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT);
}
static void
@@ -1847,7 +2078,7 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_builder *b = cs->b;
- nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset),
+ nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
.access = ACCESS_NON_WRITEABLE);
nir_def *x = nir_channel(b, draw_data, 0);
nir_def *y = nir_channel(b, draw_data, 1);
@@ -1855,15 +2086,15 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de
nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
{
- dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
- dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
-
nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
{
- dgc_emit_dispatch_taskmesh_gfx(cs);
+ dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
}
nir_push_else(b, NULL);
{
+ dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
+ dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
+
dgc_emit_userdata_mesh(cs, x, y, z, sequence_id);
dgc_emit_instance_count(cs, nir_imm_int(b, 1));
@@ -1882,6 +2113,76 @@ dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_de
nir_pop_if(b, NULL);
}
+static void
+dgc_emit_draw_mesh_tasks_with_count_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
+{
+ const struct radv_indirect_command_layout *layout = cs->layout;
+ const struct radv_device *device = cs->dev;
+ const struct radv_physical_device *pdev = radv_device_physical(device);
+ nir_builder *b = cs->b;
+
+ nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
+ {
+ dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
+ }
+ nir_push_else(b, NULL);
+ {
+ nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
+ nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
+ nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
+
+ nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
+ .access = ACCESS_NON_WRITEABLE);
+ nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
+ nir_def *stride = nir_channel(b, draw_data, 2);
+ nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
+
+ dgc_emit_pkt3_set_base(cs, va);
+
+ nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
+ nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0));
+ nir_def *draw_id_offset = nir_bcsel(b, has_grid_size, nir_imm_int(b, 3), nir_imm_int(b, 0));
+ nir_def *draw_id_reg = nir_bcsel(b, has_drawid, nir_iadd(b, base_reg, draw_id_offset), nir_imm_int(b, 0));
+
+ nir_push_if(b, has_drawid);
+ {
+ nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), draw_id_reg, nir_imm_int(b, 0)};
+ dgc_emit(cs, 3, packet);
+ }
+ nir_pop_if(b, NULL);
+
+ nir_def *draw_index_enable =
+ nir_bcsel(b, has_drawid, nir_imm_int(b, S_4C2_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
+ nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4C2_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
+
+ dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
+ dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksIndirectCountEXT);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit(nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, false) | PKT3_RESET_FILTER_CAM_S(1)));
+ dgc_cs_emit_imm(0); /* data offset */
+ /* S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg) */
+ dgc_cs_emit(
+ nir_ior(b, nir_iand_imm(b, xyz_dim_reg, 0xFFFF), nir_ishl_imm(b, nir_iand_imm(b, draw_id_reg, 0xFFFF), 16)));
+ if (pdev->info.gfx_level >= GFX11) {
+ dgc_cs_emit(nir_ior_imm(b, nir_ior(b, draw_index_enable, xyz_dim_enable),
+ S_4C2_MODE1_ENABLE(!pdev->mesh_fast_launch_2)));
+ } else {
+ dgc_cs_emit(draw_index_enable);
+ }
+ dgc_cs_emit(draw_count);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(stride);
+ dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+ dgc_cs_end();
+
+ dgc_emit_sqtt_thread_trace_marker(cs);
+ dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
+ }
+ nir_pop_if(b, NULL);
+}
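/* Same token layout as dgc_emit_draw_with_count (VkDrawIndirectCountIndirectCommandEXT);
 * the DISPATCH_MESH_INDIRECT_MULTI packet additionally encodes which user SGPRs
 * receive the XYZ dimensions and the draw index, mirroring the direct mesh path.
 */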
+
static void
dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
{
@@ -1912,18 +2213,24 @@ dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_de
nir_pop_if(b, NULL);
}
+static nir_def *
+dgc_get_dispatch_initiator_task(struct dgc_cmdbuf *ace_cs)
+{
+ const struct radv_device *device = ace_cs->dev;
+ const uint32_t dispatch_initiator_task = device->dispatch_initiator_task;
+ nir_builder *b = ace_cs->b;
+
+ nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
+ return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)),
+ nir_imm_int(b, dispatch_initiator_task));
+}
+
static void
dgc_emit_dispatch_taskmesh_direct_ace(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
{
- const struct radv_device *device = ace_cs->dev;
+ nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
nir_builder *b = ace_cs->b;
- const uint32_t dispatch_initiator_task = device->dispatch_initiator_task;
- nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
- nir_def *dispatch_initiator =
- nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)),
- nir_imm_int(b, dispatch_initiator_task));
-
dgc_cs_begin(ace_cs);
dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, 0) | PKT3_SHADER_TYPE_S(1));
dgc_cs_emit(x);
@@ -1940,7 +2247,7 @@ dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr)
const struct radv_indirect_command_layout *layout = ace_cs->layout;
nir_builder *b = ace_cs->b;
- nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->draw_params_offset),
+ nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
.access = ACCESS_NON_WRITEABLE);
nir_def *x = nir_channel(b, draw_data, 0);
nir_def *y = nir_channel(b, draw_data, 1);
@@ -1954,8 +2261,47 @@ dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr)
nir_pop_if(b, NULL);
}
+static void
+dgc_emit_draw_mesh_tasks_with_count_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr, nir_def *sequence_id)
+{
+ const struct radv_indirect_command_layout *layout = ace_cs->layout;
+ nir_builder *b = ace_cs->b;
+
+ nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
+ .access = ACCESS_NON_WRITEABLE);
+ nir_def *va_lo = nir_channel(b, draw_data, 0);
+ nir_def *va_hi = nir_channel(b, draw_data, 1);
+ nir_def *stride = nir_channel(b, draw_data, 2);
+ nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
+
+ nir_def *xyz_dim_reg = load_param16(b, task_xyz_sgpr);
+ nir_def *ring_entry_reg = load_param16(b, task_ring_entry_sgpr);
+ nir_def *draw_id_reg = load_param16(b, task_draw_id_sgpr);
+
+ nir_def *draw_index_enable =
+ nir_bcsel(b, nir_ine_imm(b, draw_id_reg, 0), nir_imm_int(b, S_AD3_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
+ nir_def *xyz_dim_enable =
+ nir_bcsel(b, nir_ine_imm(b, xyz_dim_reg, 0), nir_imm_int(b, S_AD3_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
+
+ nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
+
+ dgc_cs_begin(ace_cs);
+ dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
+ dgc_cs_emit(va_lo);
+ dgc_cs_emit(va_hi);
+ dgc_cs_emit(ring_entry_reg);
+ dgc_cs_emit(nir_ior(b, draw_index_enable, nir_ior(b, xyz_dim_enable, nir_ishl_imm(b, draw_id_reg, 16))));
+ dgc_cs_emit(xyz_dim_reg);
+ dgc_cs_emit(draw_count);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit_imm(0);
+ dgc_cs_emit(stride);
+ dgc_cs_emit(dispatch_initiator);
+ dgc_cs_end();
+}
+
/**
- * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV.
+ * Indirect execution set
*/
static void
dgc_emit_indirect_sets(struct dgc_cmdbuf *cs)
@@ -1975,11 +2321,11 @@ dgc_emit_indirect_sets(struct dgc_cmdbuf *cs)
}
static void
-dgc_emit_bind_pipeline(struct dgc_cmdbuf *cs)
+dgc_emit_ies(struct dgc_cmdbuf *cs)
{
nir_builder *b = cs->b;
- nir_def *va = nir_iadd_imm(b, cs->pipeline_va, sizeof(struct radv_compute_pipeline_metadata));
+ nir_def *va = nir_iadd_imm(b, cs->ies_va, sizeof(struct radv_compute_pipeline_metadata));
nir_def *num_dw = nir_build_load_global(b, 1, 32, va, .access = ACCESS_NON_WRITEABLE);
nir_def *cs_va = nir_iadd_imm(b, va, 4);
@@ -2006,6 +2352,66 @@ dgc_emit_bind_pipeline(struct dgc_cmdbuf *cs)
dgc_emit_indirect_sets(cs);
}
+/**
+ * Raytracing.
+ */
+static void
+dgc_emit_shader_pointer(struct dgc_cmdbuf *cs, nir_def *sh_offset, nir_def *va)
+{
+ nir_builder *b = cs->b;
+
+ nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
+ nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
+
+ dgc_cs_begin(cs);
+ dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
+ dgc_cs_emit(sh_offset);
+ dgc_cs_emit(va_lo);
+ dgc_cs_emit(va_hi);
+ dgc_cs_end();
+}
+
+static void
+dgc_emit_rt(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
+{
+ const struct radv_indirect_command_layout *layout = cs->layout;
+ const struct radv_device *device = cs->dev;
+ nir_builder *b = cs->b;
+
+ nir_def *indirect_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
+
+ nir_def *cs_sbt_descriptors = load_param16(b, cs_sbt_descriptors);
+ nir_push_if(b, nir_ine_imm(b, cs_sbt_descriptors, 0));
+ {
+ dgc_emit_shader_pointer(cs, cs_sbt_descriptors, indirect_va);
+ }
+ nir_pop_if(b, NULL);
+
+ nir_def *launch_size_va = nir_iadd_imm(b, indirect_va, offsetof(VkTraceRaysIndirectCommand2KHR, width));
+
+ nir_def *cs_ray_launch_size_addr = load_param16(b, cs_ray_launch_size_addr);
+ nir_push_if(b, nir_ine_imm(b, cs_ray_launch_size_addr, 0));
+ {
+ dgc_emit_shader_pointer(cs, cs_ray_launch_size_addr, launch_size_va);
+ }
+ nir_pop_if(b, NULL);
+
+ const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_USE_THREAD_DIMENSIONS(1);
+ nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
+ nir_def *dispatch_initiator_rt = nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)),
+ nir_imm_int(b, dispatch_initiator));
+
+ nir_def *dispatch_data = nir_build_load_global(b, 3, 32, launch_size_va, .access = ACCESS_NON_WRITEABLE);
+ nir_def *width = nir_channel(b, dispatch_data, 0);
+ nir_def *height = nir_channel(b, dispatch_data, 1);
+ nir_def *depth = nir_channel(b, dispatch_data, 2);
+
+ nir_def *grid_sgpr = load_param16(b, grid_base_sgpr);
+
+ dgc_emit_dispatch_direct(cs, width, height, depth, dispatch_initiator_rt, grid_sgpr, launch_size_va, sequence_id,
+ true);
+}
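/* The ray-tracing token above is a full VkTraceRaysIndirectCommand2KHR: the base
 * of the structure (the SBT regions) is written to the user SGPRs selected by
 * cs_sbt_descriptors, while width/height/depth at the end of the structure give
 * the launch size consumed by the DISPATCH_DIRECT emitted via
 * dgc_emit_dispatch_direct().
 */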
+
static nir_def *
dgc_is_cond_render_enabled(nir_builder *b)
{
@@ -2059,23 +2465,22 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_def *sequence_id = global_id;
nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
- nir_def *sequence_count = load_param32(&b, sequence_count);
-
- nir_def *use_count = nir_iand_imm(&b, sequence_count, 1u << 31);
- sequence_count = nir_iand_imm(&b, sequence_count, UINT32_MAX >> 1);
-
nir_def *cmd_buf_base_offset = load_param32(&b, cmd_buf_main_offset);
+ nir_def *sequence_count = load_param32(&b, sequence_count);
+ nir_def *sequence_count_addr = load_param64(&b, sequence_count_addr);
+
/* The effective number of draws is
* min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when
* using sequencesCountBuffer. Otherwise it is sequencesCount. */
nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
nir_store_var(&b, count_var, sequence_count, 0x1);
- nir_push_if(&b, nir_ine_imm(&b, use_count, 0));
+ nir_push_if(&b, nir_ine_imm(&b, sequence_count_addr, 0));
{
nir_def *cnt =
nir_build_load_global(&b, 1, 32, load_param64(&b, sequence_count_addr), .access = ACCESS_NON_WRITEABLE);
+
/* Must clamp count against the API count explicitly.
* The workgroup potentially contains more threads than maxSequencesCount from API,
* and we have to ensure these threads write NOP packets to pad out the IB. */
@@ -2111,33 +2516,40 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);
nir_def *stream_addr = load_param64(&b, stream_addr);
- stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->input_stride)));
-
- if (layout->bind_pipeline)
- cmd_buf.pipeline_va = dgc_get_pipeline_va(&cmd_buf, stream_addr);
+ stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
nir_def *upload_offset_init =
nir_iadd(&b, load_param32(&b, upload_main_offset), nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
+ cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
+
if (layout->push_constant_mask) {
const VkShaderStageFlags stages =
VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT | VK_SHADER_STAGE_MESH_BIT_EXT;
- dgc_emit_push_constant(&cmd_buf, stream_addr, stages);
+ dgc_emit_push_constant(&cmd_buf, stream_addr, sequence_id, stages);
}
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- if (layout->bind_vbo_mask) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
+ /* Raytracing */
+ dgc_emit_rt(&cmd_buf, stream_addr, sequence_id);
+ } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
+ /* Compute */
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
+ dgc_emit_ies(&cmd_buf);
+ }
+
+ dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id);
+ } else {
+ /* Graphics */
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
dgc_emit_vertex_buffer(&cmd_buf, stream_addr);
}
- if (layout->indexed) {
- /* Emit direct draws when index buffers are also updated by DGC. Otherwise, emit
- * indirect draws to remove the dependency on the cmdbuf state in order to enable
- * preprocessing.
- */
- if (layout->binds_index_buffer) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
nir_variable *max_index_count_var =
nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
@@ -2145,23 +2557,34 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_def *max_index_count = nir_load_var(&b, max_index_count_var);
- dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count);
+ if (layout->vk.draw_count) {
+ dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
+ } else {
+ dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count);
+ }
} else {
- dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true);
+ if (layout->vk.draw_count) {
+ dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
+ } else {
+ dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true);
+ }
}
} else {
- if (layout->draw_mesh_tasks) {
- dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id);
+ /* Non-indexed draws */
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
+ if (layout->vk.draw_count) {
+ dgc_emit_draw_mesh_tasks_with_count_gfx(&cmd_buf, stream_addr, sequence_id);
+ } else {
+ dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id);
+ }
} else {
- dgc_emit_draw(&cmd_buf, stream_addr, sequence_id);
+ if (layout->vk.draw_count) {
+ dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, false);
+ } else {
+ dgc_emit_draw(&cmd_buf, stream_addr, sequence_id);
+ }
}
}
- } else {
- if (layout->bind_pipeline) {
- dgc_emit_bind_pipeline(&cmd_buf);
- }
-
- dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id);
}
/* Pad the cmdbuffer if we did not use the whole stride */
@@ -2195,27 +2618,31 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), ace_cmd_buf_stride);
nir_def *stream_addr = load_param64(&b, stream_addr);
- stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->input_stride)));
-
- if (layout->bind_pipeline)
- cmd_buf.pipeline_va = dgc_get_pipeline_va(&cmd_buf, stream_addr);
+ stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
nir_def *upload_offset_init = nir_iadd(&b, load_param32(&b, upload_main_offset),
nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
+ cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
+
if (layout->push_constant_mask) {
- nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf, stream_addr);
+ nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf);
nir_push_if(&b, nir_test_mask(&b, push_constant_stages, VK_SHADER_STAGE_TASK_BIT_EXT));
{
const struct dgc_pc_params params = dgc_get_pc_params(&cmd_buf);
- dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, &params, MESA_SHADER_TASK);
+ dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, sequence_id, &params, MESA_SHADER_TASK);
}
nir_pop_if(&b, NULL);
}
- dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr);
+ if (layout->vk.draw_count) {
+ dgc_emit_draw_mesh_tasks_with_count_ace(&cmd_buf, stream_addr, sequence_id);
+ } else {
+ dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr);
+ }
/* Pad the cmdbuffer if we did not use the whole stride */
dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end);
@@ -2300,130 +2727,17 @@ radv_create_dgc_pipeline(struct radv_device *device, struct radv_indirect_comman
return result;
}
-static void
-radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
- struct radv_indirect_command_layout *layout)
-{
- radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc);
-
- vk_object_base_finish(&layout->base);
- vk_free2(&device->vk.alloc, pAllocator, layout);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
-{
- VK_FROM_HANDLE(radv_device, device, _device);
- struct radv_indirect_command_layout *layout;
- VkResult result;
-
- size_t size = sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
-
- layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!layout)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
-
- layout->flags = pCreateInfo->flags;
- layout->pipeline_bind_point = pCreateInfo->pipelineBindPoint;
- layout->input_stride = pCreateInfo->pStreamStrides[0];
- layout->token_count = pCreateInfo->tokenCount;
- typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
-
- layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
- layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_KHR;
-
- for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
- switch (pCreateInfo->pTokens[i].tokenType) {
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
- layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
- layout->indexed = true;
- layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV:
- layout->dispatch_params_offset = pCreateInfo->pTokens[i].offset;
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
- layout->binds_index_buffer = true;
- layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
- /* 16-bit is implied if we find no match. */
- for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
- if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
- layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
- else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_KHR)
- layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
- }
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
- layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
- layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] = pCreateInfo->pTokens[i].offset;
- layout->vertex_dynamic_stride = pCreateInfo->pTokens[i].vertexDynamicStride;
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: {
- VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->pTokens[i].pushconstantPipelineLayout);
- for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
- k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
- layout->push_constant_mask |= 1ull << j;
- layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
- }
- layout->push_constant_size = pipeline_layout->push_constant_size;
- assert(!pipeline_layout->dynamic_offset_count);
- break;
- }
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV:
- layout->draw_mesh_tasks = true;
- layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
- break;
- case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV:
- layout->bind_pipeline = true;
- layout->pipeline_params_offset = pCreateInfo->pTokens[i].offset;
- break;
- default:
- unreachable("Unhandled token type");
- }
- }
-
- result = radv_create_dgc_pipeline(device, layout);
- if (result != VK_SUCCESS) {
- radv_destroy_indirect_commands_layout(device, pAllocator, layout);
- return result;
- }
-
- *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
- return VK_SUCCESS;
-}
-
VKAPI_ATTR void VKAPI_CALL
-radv_DestroyIndirectCommandsLayoutNV(VkDevice _device, VkIndirectCommandsLayoutNV indirectCommandsLayout,
- const VkAllocationCallbacks *pAllocator)
-{
- VK_FROM_HANDLE(radv_device, device, _device);
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
-
- if (!layout)
- return;
-
- radv_destroy_indirect_commands_layout(device, pAllocator, layout);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device,
- const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
+radv_GetGeneratedCommandsMemoryRequirementsEXT(VkDevice _device,
+ const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
{
VK_FROM_HANDLE(radv_device, device, _device);
const struct radv_physical_device *pdev = radv_device_physical(device);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
struct dgc_cmdbuf_layout cmdbuf_layout;
- get_dgc_cmdbuf_layout(device, layout, pipeline, pInfo->maxSequencesCount, true, &cmdbuf_layout);
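+ /* With the EXT API the pipeline or shader objects are described through the pNext chain
+ * (VkGeneratedCommandsPipelineInfoEXT / VkGeneratedCommandsShaderInfoEXT), so the cmdbuf
+ * layout helper takes the pNext pointer instead of a pipeline handle.
+ */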
+ get_dgc_cmdbuf_layout(device, layout, pInfo->pNext, pInfo->maxSequenceCount, true, &cmdbuf_layout);
pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit;
pMemoryRequirements->memoryRequirements.alignment =
@@ -2433,28 +2747,12 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device,
}
bool
-radv_dgc_with_task_shader(const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
- VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
-
- if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_GRAPHICS)
- return false;
-
- if (!layout->draw_mesh_tasks)
- return false;
-
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
- const struct radv_shader *task_shader = radv_get_shader(pipeline->shaders, MESA_SHADER_TASK);
- if (!task_shader)
- return false;
-
- return true;
-}
-
-bool
-radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
-{
- VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
/* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
* the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
@@ -2462,137 +2760,42 @@ radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo
* when the graphics pipelines has a task shader for the same reason (otherwise the DGC ACE IB
* would be uninitialized).
*/
- return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_with_task_shader(pGeneratedCommandsInfo) &&
- seq_count_buffer && !cmd_buffer->state.predicating;
+ return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK) &&
+ pGeneratedCommandsInfo->sequenceCountAddress != 0 && !cmd_buffer->state.predicating;
}
-static bool
-radv_dgc_need_push_constants_copy(const struct radv_pipeline *pipeline)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
- const struct radv_shader *shader = pipeline->shaders[i];
-
- if (!shader)
- continue;
-
- const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
- if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
- return true;
- }
-
- return false;
-}
-
-bool
-radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline)
-{
- if (!(layout->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV))
- return false;
-
- /* From the Vulkan spec (1.3.269, chapter 32):
- * "The bound descriptor sets and push constants that will be used with indirect command generation for the compute
- * piplines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV.
- * They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV."
- *
- * So we can always preprocess compute layouts.
- */
- if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE) {
- /* VBO binding (in particular partial VBO binding) uses some draw state which we don't generate at preprocess time
- * yet. */
- if (layout->bind_vbo_mask)
- return false;
-
- /* Do not preprocess when all push constants can't be inlined because they need to be copied
- * to the upload BO.
- */
- if (layout->push_constant_mask && radv_dgc_need_push_constants_copy(pipeline))
- return false;
- }
-
- return true;
-}
-
-/* Always need to call this directly before draw due to dependence on bound state. */
-static void
-radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
- unsigned *upload_size, unsigned *upload_offset, void **upload_data,
- struct radv_dgc_params *params)
+VKAPI_ATTR void VKAPI_CALL
+radv_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
+ const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ VkCommandBuffer stateCommandBuffer)
{
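+ /* stateCommandBuffer provides the state (bound shaders, vertex buffers, push constants)
+ * that the generated commands will later be executed with.
+ */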
+ VK_FROM_HANDLE(radv_cmd_buffer, state_cmd_buffer, stateCommandBuffer);
+ VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
- struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
- struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
- unsigned vb_size = layout->bind_vbo_mask ? MAX_VBS * DGC_VBO_INFO_SIZE : 0;
- *upload_size = MAX2(*upload_size + vb_size, 16);
+ assert(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT);
- if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
- vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
- return;
- }
+ /* VK_EXT_conditional_rendering says that copy commands should not be
+ * affected by conditional rendering.
+ */
+ const bool old_predicating = cmd_buffer->state.predicating;
+ cmd_buffer->state.predicating = false;
- uint16_t vtx_base_sgpr = 0;
+ radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, old_predicating);
- if (graphics_pipeline->vtx_base_sgpr)
- vtx_base_sgpr = (graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
-
- if (graphics_pipeline->uses_drawid)
- vtx_base_sgpr |= DGC_USES_DRAWID;
-
- if (layout->draw_mesh_tasks) {
- struct radv_shader *mesh_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_MESH);
- const struct radv_shader *task_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_TASK);
-
- if (mesh_shader->info.cs.uses_grid_size)
- vtx_base_sgpr |= DGC_USES_GRID_SIZE;
-
- if (task_shader) {
- params->has_task_shader = 1;
- params->mesh_ring_entry_sgpr = radv_get_user_sgpr(mesh_shader, AC_UD_TASK_RING_ENTRY);
- params->wave32 = task_shader->info.wave_size == 32;
- params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch;
- params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
- params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
- params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
- }
- } else {
- if (graphics_pipeline->uses_baseinstance)
- vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
- }
-
- params->vtx_base_sgpr = vtx_base_sgpr;
- params->max_index_count = cmd_buffer->state.max_index_count;
- params->dynamic_vs_input = layout->bind_vbo_mask && vs->info.vs.dynamic_inputs;
- params->use_per_attribute_vb_descs = layout->bind_vbo_mask && vs->info.vs.use_per_attribute_vb_descs;
-
- if (layout->bind_vbo_mask) {
- uint8_t *ptr = (uint8_t *)((char *)*upload_data);
-
- for (uint32_t i = 0; i < MAX_VBS; i++) {
- struct radv_vbo_info vbo_info;
- radv_get_vbo_info(cmd_buffer, i, &vbo_info);
-
- memcpy(ptr, &vbo_info, sizeof(vbo_info));
- ptr += sizeof(struct radv_vbo_info);
-
- memcpy(ptr, &layout->vbo_offsets[vbo_info.binding], sizeof(uint32_t));
- ptr += sizeof(uint32_t);
- }
- params->vb_desc_usage_mask = vs->info.vs.vb_desc_usage_mask;
- params->vbo_reg = radv_get_user_sgpr(vs, AC_UD_VS_VERTEX_BUFFERS);
-
- *upload_data = (char *)*upload_data + vb_size;
- }
+ /* Restore conditional rendering. */
+ cmd_buffer->state.predicating = old_predicating;
}
static void
-radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
- unsigned *upload_size, unsigned *upload_offset, void **upload_data,
- struct radv_dgc_params *params, bool cond_render_enabled)
+radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
+ void **upload_data, struct radv_dgc_params *params, bool cond_render_enabled)
{
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
+ VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
- const uint32_t alloc_size = pipeline ? sizeof(struct radv_compute_pipeline_metadata) : 0;
+ const uint32_t alloc_size = ies ? 0 : sizeof(struct radv_compute_pipeline_metadata);
*upload_size = MAX2(*upload_size + alloc_size, 16);
@@ -2607,59 +2810,172 @@ radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCo
params->predication_type = cmd_buffer->state.predication_type;
}
- if (pipeline) {
- struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
- struct radv_shader *cs = radv_get_shader(compute_pipeline->base.shaders, MESA_SHADER_COMPUTE);
+ if (ies) {
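+ /* With an indirect execution set, the compute shader metadata is read from the IES buffer
+ * by the DGC prepare shader at execution time, so only the descriptor state is uploaded here.
+ */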
+ struct radv_descriptor_state *descriptors_state =
+ radv_get_descriptors_state(state_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
+
+ radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);
+
+ params->ies_stride = ies->stride;
+ params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va;
+ } else {
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
+ const struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
struct radv_compute_pipeline_metadata *metadata = (struct radv_compute_pipeline_metadata *)(*upload_data);
radv_get_compute_shader_metadata(device, cs, metadata);
*upload_data = (char *)*upload_data + alloc_size;
+ }
+}
+
+static void
+radv_prepare_dgc_rt(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ unsigned *upload_size, unsigned *upload_offset, void **upload_data, struct radv_dgc_params *params)
+{
+ if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
+ vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return;
+ }
+
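+ /* Ray tracing dispatches go through the RT prolog shader: fetch its wave size and the
+ * launch size/SBT user SGPRs for the DGC prepare shader.
+ */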
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
+ const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
+ const struct radv_shader *rt_prolog = rt_pipeline->prolog;
+
+ params->wave32 = rt_prolog->info.wave_size == 32;
+ params->grid_base_sgpr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_GRID_SIZE);
+ params->cs_sbt_descriptors = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
+ params->cs_ray_launch_size_addr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
+}
+
+static uint32_t
+get_dgc_vertex_binding_offset(const struct radv_indirect_command_layout *layout, uint32_t binding)
+{
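+ /* Returns -1u when the binding is not written by a DGC vertex buffer token. */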
+ for (uint32_t i = 0; i < layout->vk.n_vb_layouts; i++) {
+ if (layout->vk.vb_layouts[i].binding == binding)
+ return layout->vk.vb_layouts[i].src_offset_B;
+ }
+
+ return -1;
+}
+
+static void
+radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
+ void **upload_data, struct radv_dgc_params *params)
+{
+ VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
+
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
+
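+ /* The DGC draw parameters are consumed by the mesh shader for DrawMeshTasks layouts and by
+ * the vertex shader otherwise.
+ */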
+ const gl_shader_stage first_stage =
+ (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX;
+ struct radv_shader *first_shader = radv_dgc_get_shader(pipeline_info, eso_info, first_stage);
+
+ unsigned vb_size = (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) ? MAX_VBS * DGC_VBO_INFO_SIZE : 0;
+
+ *upload_size = MAX2(*upload_size + vb_size, 16);
+
+ if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
+ vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return;
+ }
+
+ uint16_t vtx_base_sgpr = radv_get_user_sgpr(first_shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
+ const bool uses_drawid = first_shader->info.vs.needs_draw_id;
+
+ if (uses_drawid)
+ vtx_base_sgpr |= DGC_USES_DRAWID;
+
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
+ if (first_shader->info.cs.uses_grid_size)
+ vtx_base_sgpr |= DGC_USES_GRID_SIZE;
+
+ const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
+ if (task_shader) {
+ params->has_task_shader = 1;
+ params->mesh_ring_entry_sgpr = radv_get_user_sgpr(first_shader, AC_UD_TASK_RING_ENTRY);
+ params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch;
+ params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
+ params->wave32 = task_shader->info.wave_size == 32;
+ params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
+ params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
+ }
} else {
- struct radv_descriptor_state *descriptors_state =
- radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
+ const bool uses_baseinstance = first_shader->info.vs.needs_base_instance;
- radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);
+ if (uses_baseinstance)
+ vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
+ }
- params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va;
+ params->vtx_base_sgpr = vtx_base_sgpr;
+ params->max_index_count = state_cmd_buffer->state.max_index_count;
+ params->max_draw_count = pGeneratedCommandsInfo->maxDrawCount;
+ params->dynamic_vs_input =
+ (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.dynamic_inputs;
+ params->use_per_attribute_vb_descs =
+ (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.use_per_attribute_vb_descs;
+
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
+ uint8_t *ptr = (uint8_t *)((char *)*upload_data);
+
+ for (uint32_t i = 0; i < MAX_VBS; i++) {
+ struct radv_vbo_info vbo_info;
+ radv_get_vbo_info(state_cmd_buffer, i, &vbo_info);
+
+ const uint32_t vbo_offset = get_dgc_vertex_binding_offset(layout, vbo_info.binding);
+
+ memcpy(ptr, &vbo_info, sizeof(vbo_info));
+ ptr += sizeof(struct radv_vbo_info);
+
+ memcpy(ptr, &vbo_offset, sizeof(uint32_t));
+ ptr += sizeof(uint32_t);
+ }
+ params->vb_desc_usage_mask = first_shader->info.vs.vb_desc_usage_mask;
+ params->vbo_reg = radv_get_user_sgpr(first_shader, AC_UD_VS_VERTEX_BUFFERS);
+
+ *upload_data = (char *)*upload_data + vb_size;
}
}
void
-radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
- bool cond_render_enabled)
+radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
- VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
- VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
- VK_FROM_HANDLE(radv_buffer, stream_buffer, pGeneratedCommandsInfo->pStreams[0].buffer);
- VK_FROM_HANDLE(radv_buffer, sequence_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
+ VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
struct radv_meta_saved_state saved_state;
- unsigned upload_offset, upload_size;
+ unsigned upload_offset, upload_size = 0;
struct radv_buffer token_buffer;
void *upload_data;
- uint64_t upload_addr =
- radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
-
- uint64_t stream_addr =
- radv_buffer_get_va(stream_buffer->bo) + stream_buffer->offset + pGeneratedCommandsInfo->pStreams[0].offset;
-
- uint64_t sequence_count_addr = 0;
- if (sequence_count_buffer)
- sequence_count_addr = radv_buffer_get_va(sequence_count_buffer->bo) + sequence_count_buffer->offset +
- pGeneratedCommandsInfo->sequencesCountOffset;
+ const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
+ const VkGeneratedCommandsShaderInfoEXT *eso_info =
+ vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
- const uint32_t sequences_count = pGeneratedCommandsInfo->sequencesCount;
+ const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
struct dgc_cmdbuf_layout cmdbuf_layout;
- get_dgc_cmdbuf_layout(device, layout, pipeline, sequences_count, use_preamble, &cmdbuf_layout);
+ get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
- assert((cmdbuf_layout.main_offset + upload_addr) % pdev->info.ip[AMD_IP_GFX].ib_alignment == 0);
- assert((cmdbuf_layout.ace_main_offset + upload_addr) % pdev->info.ip[AMD_IP_COMPUTE].ib_alignment == 0);
+ assert((cmdbuf_layout.main_offset + pGeneratedCommandsInfo->preprocessAddress) %
+ pdev->info.ip[AMD_IP_GFX].ib_alignment ==
+ 0);
+ assert((cmdbuf_layout.ace_main_offset + pGeneratedCommandsInfo->preprocessAddress) %
+ pdev->info.ip[AMD_IP_COMPUTE].ib_alignment ==
+ 0);
struct radv_dgc_params params = {
.cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset,
@@ -2672,67 +2988,98 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
.ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride,
.ace_cmd_buf_size = cmdbuf_layout.ace_size,
.upload_main_offset = cmdbuf_layout.upload_offset,
- .upload_addr = (uint32_t)upload_addr,
+ .upload_addr = (uint32_t)pGeneratedCommandsInfo->preprocessAddress,
.upload_stride = cmdbuf_layout.upload_stride,
- .sequence_count = sequences_count | (sequence_count_addr ? 1u << 31 : 0),
- .sequence_count_addr = sequence_count_addr,
+ .sequence_count = sequences_count,
.use_preamble = use_preamble,
- .stream_addr = stream_addr,
+ .stream_addr = pGeneratedCommandsInfo->indirectAddress,
+ .sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress,
+ .ies_addr = ies ? ies->va : 0,
+ .queue_family = state_cmd_buffer->qf,
};
- upload_size = layout->push_constant_size + ARRAY_SIZE(pipeline->shaders) * 12;
- if (!layout->push_constant_mask)
- upload_size = 0;
+ VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
- if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data,
- &params);
+ if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
+ upload_size = pipeline_layout->push_constant_size + MESA_VULKAN_SHADER_STAGES * 12;
+ }
+
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
+ radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
+ &upload_data, &params, cond_render_enabled);
+ } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
+ radv_prepare_dgc_rt(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params);
} else {
- assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
- radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params,
- cond_render_enabled);
+ radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
+ &upload_data, &params);
}
if (layout->push_constant_mask) {
VkShaderStageFlags pc_stages = 0;
uint32_t *desc = upload_data;
- upload_data = (char *)upload_data + ARRAY_SIZE(pipeline->shaders) * 12;
+ upload_data = (char *)upload_data + MESA_VULKAN_SHADER_STAGES * 12;
- if (pipeline) {
- for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
- if (!pipeline->shaders[i])
- continue;
+ struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
+ if (pipeline_info) {
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
- const struct radv_shader *shader = pipeline->shaders[i];
- const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
- if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
- params.const_copy = 1;
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
+ const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
+ struct radv_shader *rt_prolog = rt_pipeline->prolog;
- if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
- locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
- unsigned upload_sgpr = 0;
- unsigned inline_sgpr = 0;
+ shaders[MESA_SHADER_COMPUTE] = rt_prolog;
+ } else {
+ memcpy(shaders, pipeline->shaders, sizeof(shaders));
+ }
+ } else if (eso_info) {
+ for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
+ VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
+ struct radv_shader *shader = shader_object->shader;
+ gl_shader_stage stage = shader->info.stage;
- if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
- upload_sgpr = radv_get_user_sgpr(shader, AC_UD_PUSH_CONSTANTS);
- }
+ shaders[stage] = shader;
+ }
+ }
- if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
- inline_sgpr = radv_get_user_sgpr(shader, AC_UD_INLINE_PUSH_CONSTANTS);
- desc[i * 3 + 1] = pipeline->shaders[i]->info.inline_push_constant_mask;
- desc[i * 3 + 2] = pipeline->shaders[i]->info.inline_push_constant_mask >> 32;
- }
- desc[i * 3] = upload_sgpr | (inline_sgpr << 16);
+ for (unsigned i = 0; i < ARRAY_SIZE(shaders); i++) {
+ const struct radv_shader *shader = shaders[i];
- pc_stages |= mesa_to_vk_shader_stage(i);
+ if (!shader)
+ continue;
+
+ const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
+ if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
+ params.const_copy = 1;
+ }
+
+ if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
+ locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
+ unsigned upload_sgpr = 0;
+ unsigned inline_sgpr = 0;
+
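+ /* Convert the push constant user SGPR locations into SH register dword offsets so the DGC
+ * prepare shader can write them directly.
+ */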
+ if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
+ upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
+ SI_SH_REG_OFFSET) >>
+ 2;
}
+
+ if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
+ inline_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
+ SI_SH_REG_OFFSET) >>
+ 2;
+ desc[i * 3 + 1] = shader->info.inline_push_constant_mask;
+ desc[i * 3 + 2] = shader->info.inline_push_constant_mask >> 32;
+ }
+ desc[i * 3] = upload_sgpr | (inline_sgpr << 16);
+
+ pc_stages |= mesa_to_vk_shader_stage(i);
}
}
params.push_constant_stages = pc_stages;
- memcpy(upload_data, cmd_buffer->push_constants, layout->push_constant_size);
- upload_data = (char *)upload_data + layout->push_constant_size;
+ memcpy(upload_data, state_cmd_buffer->push_constants, pipeline_layout->push_constant_size);
+ upload_data = (char *)upload_data + pipeline_layout->push_constant_size;
}
radv_buffer_init(&token_buffer, device, cmd_buffer->upload.upload_bo, upload_size, upload_offset);
@@ -2756,36 +3103,256 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
.offset = 0,
.range = upload_size}}});
- unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->sequencesCount, 64));
+ unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->maxSequenceCount, 64));
vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
radv_buffer_finish(&token_buffer);
radv_meta_restore(&saved_state, cmd_buffer);
}
-/* VK_NV_device_generated_commands_compute */
-VKAPI_ATTR void VKAPI_CALL
-radv_GetPipelineIndirectMemoryRequirementsNV(VkDevice _device, const VkComputePipelineCreateInfo *pCreateInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
+static void
+radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
+ struct radv_indirect_command_layout *layout)
+{
+ radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc);
+
+ vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+radv_CreateIndirectCommandsLayoutEXT(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
+{
+ VK_FROM_HANDLE(radv_device, device, _device);
+ struct radv_indirect_command_layout *layout;
+ VkResult result;
+
+ layout = vk_indirect_command_layout_create(&device->vk, pCreateInfo, pAllocator, sizeof(*layout));
+ if (!layout)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
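+ /* Build a dword mask of the push constants written by DGC and record, for each dword, its
+ * source offset in the indirect command stream.
+ */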
+ for (uint32_t i = 0; i < layout->vk.n_pc_layouts; i++) {
+ for (uint32_t j = layout->vk.pc_layouts[i].dst_offset_B / 4, k = 0; k < layout->vk.pc_layouts[i].size_B / 4;
+ j++, k++) {
+ layout->push_constant_mask |= 1ull << j;
+ layout->push_constant_offsets[j] = layout->vk.pc_layouts[i].src_offset_B + k * 4;
+ }
+ }
+
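+ /* The sequence index is emitted like a push constant dword at its destination offset. */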
+ if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_SI)) {
+ layout->sequence_index_mask = 1ull << (layout->vk.si_layout.dst_offset_B / 4);
+ layout->push_constant_mask |= layout->sequence_index_mask;
+ }
+
+ result = radv_create_dgc_pipeline(device, layout);
+ if (result != VK_SUCCESS) {
+ radv_destroy_indirect_commands_layout(device, pAllocator, layout);
+ return result;
+ }
+
+ *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
+ return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+radv_DestroyIndirectCommandsLayoutEXT(VkDevice _device, VkIndirectCommandsLayoutEXT indirectCommandsLayout,
+ const VkAllocationCallbacks *pAllocator)
+{
+ VK_FROM_HANDLE(radv_device, device, _device);
+ VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
+
+ if (!layout)
+ return;
+
+ vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk);
+}
+
+static void
+radv_update_ies_shader(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
+ struct radv_shader *shader)
+{
+ const struct radv_physical_device *pdev = radv_device_physical(device);
+ uint8_t *ptr = set->mapped_ptr + set->stride * index;
+ struct radv_compute_pipeline_metadata md;
+ struct radeon_cmdbuf *cs;
+
+ assert(shader->info.stage == MESA_SHADER_COMPUTE);
+ radv_get_compute_shader_metadata(device, shader, &md);
+
+ cs = calloc(1, sizeof(*cs));
+ if (!cs)
+ return;
+
+ cs->reserved_dw = cs->max_dw = 32;
+ cs->buf = malloc(cs->max_dw * 4);
+ if (!cs->buf) {
+ free(cs);
+ return;
+ }
+
+ radv_emit_compute_shader(pdev, cs, shader);
+
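+ /* Entry layout: compute pipeline metadata, then the CS dword count, then the PM4 packets
+ * emitted above.
+ */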
+ memcpy(ptr, &md, sizeof(md));
+ ptr += sizeof(md);
+
+ memcpy(ptr, &cs->cdw, sizeof(uint32_t));
+ ptr += sizeof(uint32_t);
+
+ memcpy(ptr, cs->buf, cs->cdw * sizeof(uint32_t));
+ ptr += cs->cdw * sizeof(uint32_t);
+
+ set->compute_scratch_size_per_wave = MAX2(set->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave);
+ set->compute_scratch_waves = MAX2(set->compute_scratch_waves, radv_get_max_scratch_waves(device, shader));
+
+ free(cs->buf);
+ free(cs);
+}
+
+static void
+radv_update_ies_pipeline(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
+ const struct radv_pipeline *pipeline)
+{
+ assert(pipeline->type == RADV_PIPELINE_COMPUTE);
+ radv_update_ies_shader(device, set, index, pipeline->shaders[MESA_SHADER_COMPUTE]);
+}
+
+static void
+radv_destroy_indirect_execution_set(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
+ struct radv_indirect_execution_set *set)
+{
+ if (set->bo)
+ radv_bo_destroy(device, &set->base, set->bo);
+
+ vk_object_base_finish(&set->base);
+ vk_free2(&device->vk.alloc, pAllocator, set);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+radv_CreateIndirectExecutionSetEXT(VkDevice _device, const VkIndirectExecutionSetCreateInfoEXT *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkIndirectExecutionSetEXT *pIndirectExecutionSet)
{
- VkMemoryRequirements *reqs = &pMemoryRequirements->memoryRequirements;
VK_FROM_HANDLE(radv_device, device, _device);
const struct radv_physical_device *pdev = radv_device_physical(device);
- uint32_t size;
+ struct radv_indirect_execution_set *set;
+ uint32_t num_entries;
+ uint32_t stride;
+ VkResult result;
- size = sizeof(struct radv_compute_pipeline_metadata);
- size += 4 /* num CS DW */;
- size += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4;
+ set = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*set), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!set)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- reqs->memoryTypeBits = ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;
- reqs->alignment = 4;
- reqs->size = align(size, reqs->alignment);
+ vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT);
+
+ switch (pCreateInfo->type) {
+ case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
+ const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);
+
+ assert(pipeline->type == RADV_PIPELINE_COMPUTE);
+ num_entries = pipeline_info->maxPipelineCount;
+ break;
+ }
+ case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
+ const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
+ VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);
+
+ assert(shader_object->stage == MESA_SHADER_COMPUTE);
+ num_entries = shaders_info->maxShaderCount;
+ break;
+ }
+ default:
+ unreachable("Invalid IES type");
+ }
+
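+ /* Per-entry stride: compute shader metadata, the CS dword count and room for the register
+ * packets emitted by radv_emit_compute_shader().
+ */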
+ stride = sizeof(struct radv_compute_pipeline_metadata);
+ stride += 4 /* num CS DW */;
+ stride += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4;
+
+ result = radv_bo_create(device, &set->base, num_entries * stride, 8, RADEON_DOMAIN_VRAM,
+ RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY, RADV_BO_PRIORITY_DESCRIPTOR, 0,
+ false, &set->bo);
+ if (result != VK_SUCCESS) {
+ radv_destroy_indirect_execution_set(device, pAllocator, set);
+ return vk_error(device, result);
+ }
+
+ set->mapped_ptr = (uint8_t *)radv_buffer_map(device->ws, set->bo);
+ if (!set->mapped_ptr) {
+ radv_destroy_indirect_execution_set(device, pAllocator, set);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ set->va = radv_buffer_get_va(set->bo);
+ set->stride = stride;
+
+ /* The driver is supposed to always populate slot 0 with the initial pipeline/shader. */
+ switch (pCreateInfo->type) {
+ case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
+ const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
+ VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);
+
+ radv_update_ies_pipeline(device, set, 0, pipeline);
+ break;
+ }
+ case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
+ const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
+ VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);
+
+ radv_update_ies_shader(device, set, 0, shader_object->shader);
+ break;
+ }
+ default:
+ unreachable("Invalid IES type");
+ }
+
+ *pIndirectExecutionSet = radv_indirect_execution_set_to_handle(set);
+ return VK_SUCCESS;
}
-VKAPI_ATTR VkDeviceAddress VKAPI_CALL
-radv_GetPipelineIndirectDeviceAddressNV(VkDevice device, const VkPipelineIndirectDeviceAddressInfoNV *pInfo)
+VKAPI_ATTR void VKAPI_CALL
+radv_DestroyIndirectExecutionSetEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
+ const VkAllocationCallbacks *pAllocator)
{
- VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
+ VK_FROM_HANDLE(radv_device, device, _device);
+ VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
- return radv_pipeline_to_compute(pipeline)->indirect.va;
+ if (!set)
+ return;
+
+ radv_destroy_indirect_execution_set(device, pAllocator, set);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+radv_UpdateIndirectExecutionSetPipelineEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
+ uint32_t executionSetWriteCount,
+ const VkWriteIndirectExecutionSetPipelineEXT *pExecutionSetWrites)
+{
+ VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
+ VK_FROM_HANDLE(radv_device, device, _device);
+
+ for (uint32_t i = 0; i < executionSetWriteCount; i++) {
+ const VkWriteIndirectExecutionSetPipelineEXT *writeset = &pExecutionSetWrites[i];
+ VK_FROM_HANDLE(radv_pipeline, pipeline, writeset->pipeline);
+
+ radv_update_ies_pipeline(device, set, writeset->index, pipeline);
+ }
+}
+
+VKAPI_ATTR void VKAPI_CALL
+radv_UpdateIndirectExecutionSetShaderEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
+ uint32_t executionSetWriteCount,
+ const VkWriteIndirectExecutionSetShaderEXT *pExecutionSetWrites)
+{
+ VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
+ VK_FROM_HANDLE(radv_device, device, _device);
+
+ for (uint32_t i = 0; i < executionSetWriteCount; i++) {
+ const VkWriteIndirectExecutionSetShaderEXT *writeset = &pExecutionSetWrites[i];
+ VK_FROM_HANDLE(radv_shader_object, shader_object, writeset->shader);
+
+ radv_update_ies_shader(device, set, writeset->index, shader_object->shader);
+ }
}
diff --git a/src/amd/vulkan/radv_dgc.h b/src/amd/vulkan/radv_dgc.h
new file mode 100644
index 00000000000..f208a07805a
--- /dev/null
+++ b/src/amd/vulkan/radv_dgc.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2024 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RADV_DGC_H
+#define RADV_DGC_H
+
+#include "compiler/shader_enums.h"
+
+#include "radv_constants.h"
+
+#include "vk_device_generated_commands.h"
+
+struct radv_cmd_buffer;
+enum radv_queue_family;
+
+struct radv_indirect_command_layout {
+ struct vk_indirect_command_layout vk;
+
+ uint64_t push_constant_mask;
+ uint32_t push_constant_offsets[MAX_PUSH_CONSTANTS_SIZE / 4];
+ uint64_t sequence_index_mask;
+
+ VkPipeline pipeline;
+};
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_command_layout, vk.base, VkIndirectCommandsLayoutEXT,
+ VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT)
+
+struct radv_indirect_execution_set {
+ struct vk_object_base base;
+
+ struct radeon_winsys_bo *bo;
+ uint64_t va;
+ uint8_t *mapped_ptr;
+
+ uint32_t stride;
+
+ uint32_t compute_scratch_size_per_wave;
+ uint32_t compute_scratch_waves;
+};
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(radv_indirect_execution_set, base, VkIndirectExecutionSetEXT,
+ VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT);
+
+uint32_t radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+uint32_t radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+
+uint32_t radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+uint32_t radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+
+uint32_t radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+uint32_t radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+
+void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
+ struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled);
+
+bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
+ const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo);
+
+struct radv_shader *radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info,
+ const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage);
+
+#endif /* RADV_DGC_H */
diff --git a/src/amd/vulkan/radv_instance.c b/src/amd/vulkan/radv_instance.c
index 9ec86ee0d8a..ff98e336596 100644
--- a/src/amd/vulkan/radv_instance.c
+++ b/src/amd/vulkan/radv_instance.c
@@ -148,7 +148,6 @@ static const driOptionDescription radv_dri_options[] = {
DRI_CONF_RADV_DISABLE_TRUNC_COORD(false)
DRI_CONF_RADV_DISABLE_SINKING_LOAD_INPUT_FS(false)
DRI_CONF_RADV_DISABLE_DEPTH_STORAGE(false)
- DRI_CONF_RADV_DGC(false)
DRI_CONF_RADV_FLUSH_BEFORE_QUERY_COPY(false)
DRI_CONF_RADV_ENABLE_UNIFIED_HEAP_ON_APU(false)
DRI_CONF_RADV_TEX_NON_UNIFORM(false)
@@ -243,8 +242,6 @@ radv_init_dri_options(struct radv_instance *instance)
instance->drirc.override_ray_tracing_shader_version =
driQueryOptioni(&instance->drirc.options, "radv_override_ray_tracing_shader_version");
- instance->drirc.enable_dgc = driQueryOptionb(&instance->drirc.options, "radv_dgc");
-
instance->drirc.override_vram_size = driQueryOptioni(&instance->drirc.options, "override_vram_size");
instance->drirc.enable_khr_present_wait = driQueryOptionb(&instance->drirc.options, "vk_khr_present_wait");
diff --git a/src/amd/vulkan/radv_instance.h b/src/amd/vulkan/radv_instance.h
index 9c16c806cc3..fc103e1a1da 100644
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@@ -66,7 +66,6 @@ struct radv_instance {
bool legacy_sparse_binding;
bool force_pstate_peak_gfx11_dgpu;
bool clear_lds;
- bool enable_dgc;
bool enable_khr_present_wait;
bool report_llvm9_version_string;
bool vk_require_etc2;
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index d6ce01d3c29..baf1ad57a9c 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -747,10 +747,6 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
.INTEL_shader_integer_functions2 = true,
.MESA_image_alignment_control = pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level <= GFX11_5,
.NV_compute_shader_derivatives = true,
- .NV_device_generated_commands =
- pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
- .NV_device_generated_commands_compute =
- pdev->info.gfx_level >= GFX8 && instance->drirc.enable_dgc && !(instance->debug_flags & RADV_DEBUG_NO_IBS),
/* Undocumented extension purely for vkd3d-proton. This check is to prevent anyone else from
* using it.
*/
@@ -1129,9 +1125,6 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc
.performanceCounterQueryPools = has_perf_query,
.performanceCounterMultipleQueryPools = has_perf_query,
- /* VK_NV_device_generated_commands */
- .deviceGeneratedCommandsNV = true,
-
/* VK_EXT_attachment_feedback_loop_layout */
.attachmentFeedbackLoopLayout = true,
@@ -1214,11 +1207,6 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc
/* VK_KHR_maintenance5 */
.maintenance5 = true,
- /* VK_NV_device_generated_commands_compute */
- .deviceGeneratedCompute = true,
- .deviceGeneratedComputePipelines = true,
- .deviceGeneratedComputeCaptureReplay = false,
-
/* VK_KHR_cooperative_matrix */
.cooperativeMatrix = pdev->info.gfx_level >= GFX11 && !pdev->use_llvm,
.cooperativeMatrixRobustBufferAccess = pdev->info.gfx_level >= GFX11 && !pdev->use_llvm,
@@ -1830,20 +1818,6 @@ radv_get_physical_device_properties(struct radv_physical_device *pdev)
/* VK_KHR_performance_query */
p->allowCommandBufferQueryCopies = false;
- /* VK_NV_device_generated_commands */
- p->maxIndirectCommandsStreamCount = 1;
- p->maxIndirectCommandsStreamStride = UINT32_MAX;
- p->maxIndirectCommandsTokenCount = 512;
- p->maxIndirectCommandsTokenOffset = UINT16_MAX;
- p->minIndirectCommandsBufferOffsetAlignment = 4;
- p->minSequencesCountBufferOffsetAlignment = 4;
- p->minSequencesIndexBufferOffsetAlignment = 4;
- /* Don't support even a shader group count = 1 until we support shader
- * overrides during pipeline creation. */
- p->maxGraphicsShaderGroupCount = 0;
- /* MSB reserved for signalling indirect count enablement. */
- p->maxIndirectSequenceCount = UINT32_MAX >> 1;
-
/* VK_EXT_graphics_pipeline_library */
p->graphicsPipelineLibraryFastLinking = true;
p->graphicsPipelineLibraryIndependentInterpolationDecoration = true;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 56cc2b4b62e..a9df9b6b8ae 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -137,7 +137,7 @@ radv_pipeline_get_shader_key(const struct radv_device *device, const VkPipelineS
if (flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
key.view_index_from_device_index = 1;
- if (flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV)
+ if (flags & VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT)
key.indirect_bindable = 1;
if (stage->stage & RADV_GRAPHICS_STAGE_BITS) {
diff --git a/src/amd/vulkan/radv_pipeline_compute.c b/src/amd/vulkan/radv_pipeline_compute.c
index de9abc08505..32d04d44257 100644
--- a/src/amd/vulkan/radv_pipeline_compute.c
+++ b/src/amd/vulkan/radv_pipeline_compute.c
@@ -305,37 +305,6 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkC
radv_compute_pipeline_init(pipeline, pipeline_layout, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
- if (pipeline->base.create_flags & VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV) {
- const VkComputePipelineIndirectBufferInfoNV *indirect_buffer =
- vk_find_struct_const(pCreateInfo->pNext, COMPUTE_PIPELINE_INDIRECT_BUFFER_INFO_NV);
- struct radv_shader *shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
- const struct radv_physical_device *pdev = radv_device_physical(device);
- struct radeon_cmdbuf *cs = &pipeline->indirect.cs;
-
- cs->reserved_dw = cs->max_dw = 32;
- cs->buf = malloc(cs->max_dw * 4);
- if (!cs->buf) {
- radv_pipeline_destroy(device, &pipeline->base, pAllocator);
- return result;
- }
-
- radv_emit_compute_shader(pdev, cs, shader);
-
- pipeline->indirect.va = indirect_buffer->deviceAddress;
- pipeline->indirect.size = indirect_buffer->size;
-
- /* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queues supporting transfer
- * operations and it's not required to call it on the same queue as the DGC execute. Because
- * it's not possible to know if the compute shader uses scratch when DGC execute is called,
- * the only solution is gather the max scratch size of all indirect pipelines.
- */
- simple_mtx_lock(&device->compute_scratch_mtx);
- device->compute_scratch_size_per_wave =
- MAX2(device->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave);
- device->compute_scratch_waves = MAX2(device->compute_scratch_waves, radv_get_max_scratch_waves(device, shader));
- simple_mtx_unlock(&device->compute_scratch_mtx);
- }
-
*pPipeline = radv_pipeline_to_handle(&pipeline->base);
radv_rmv_log_compute_pipeline_create(device, &pipeline->base, pipeline->base.is_internal);
return VK_SUCCESS;
@@ -371,12 +340,8 @@ radv_create_compute_pipelines(VkDevice _device, VkPipelineCache pipelineCache, u
void
radv_destroy_compute_pipeline(struct radv_device *device, struct radv_compute_pipeline *pipeline)
{
- struct radeon_cmdbuf *cs = &pipeline->indirect.cs;
-
if (pipeline->base.shaders[MESA_SHADER_COMPUTE])
radv_shader_unref(device, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
-
- free(cs->buf);
}
VKAPI_ATTR VkResult VKAPI_CALL
diff --git a/src/amd/vulkan/radv_pipeline_compute.h b/src/amd/vulkan/radv_pipeline_compute.h
index cc75d90396a..9ac879d7ad9 100644
--- a/src/amd/vulkan/radv_pipeline_compute.h
+++ b/src/amd/vulkan/radv_pipeline_compute.h
@@ -19,12 +19,6 @@ struct radv_shader_info;
struct radv_compute_pipeline {
struct radv_pipeline base;
-
- struct {
- struct radeon_cmdbuf cs;
- uint64_t va;
- uint64_t size;
- } indirect;
};
RADV_DECL_PIPELINE_DOWNCAST(compute, RADV_PIPELINE_COMPUTE)
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index 4c02188d0d4..16127383a4d 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -1268,7 +1268,6 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
bool *has_follower)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
- bool has_indirect_pipeline_binds = false;
if (queue->qf != RADV_QUEUE_GENERAL && queue->qf != RADV_QUEUE_COMPUTE) {
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
@@ -1308,16 +1307,6 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.sample_positions |= cmd_buffer->sample_positions_needed;
*use_perf_counters |= cmd_buffer->state.uses_perf_counters;
*has_follower |= !!cmd_buffer->gang.cs;
-
- has_indirect_pipeline_binds |= cmd_buffer->has_indirect_pipeline_binds;
- }
-
- if (has_indirect_pipeline_binds) {
- /* Use the maximum possible scratch size for indirect compute pipelines with DGC. */
- simple_mtx_lock(&device->compute_scratch_mtx);
- needs.compute_scratch_size_per_wave = MAX2(needs.compute_scratch_waves, device->compute_scratch_size_per_wave);
- needs.compute_scratch_waves = MAX2(needs.compute_scratch_waves, device->compute_scratch_waves);
- simple_mtx_unlock(&device->compute_scratch_mtx);
}
/* Sanitize scratch size information. */
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 60c2d9c9b55..369586f2a0d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -886,13 +886,6 @@ radv_create_shader_arena(struct radv_device *device, struct radv_shader_free_lis
if (replayable)
flags |= RADEON_FLAG_REPLAYABLE;
- /* vkCmdUpdatePipelineIndirectBufferNV() can be called on any queue supporting transfer
- * operations and it's not required to call it on the same queue as DGC execute. To make sure the
- * compute shader BO is part of the DGC execute submission, force all shaders to be local BOs.
- */
- if (device->vk.enabled_features.deviceGeneratedComputePipelines)
- flags |= RADEON_FLAG_PREFER_LOCAL_BO;
-
VkResult result;
result = radv_bo_create(device, NULL, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM, flags,
RADV_BO_PRIORITY_SHADER, replay_va, true, &arena->bo);
diff --git a/src/amd/vulkan/radv_shader_object.c b/src/amd/vulkan/radv_shader_object.c
index 36f55a4af4d..e5b57476e0f 100644
--- a/src/amd/vulkan/radv_shader_object.c
+++ b/src/amd/vulkan/radv_shader_object.c
@@ -110,6 +110,9 @@ radv_shader_stage_init(const VkShaderCreateInfoEXT *sinfo, struct radv_shader_st
out_stage->key.subgroup_require_full = 1;
}
+ if (sinfo->flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT)
+ out_stage->key.indirect_bindable = 1;
+
if (out_stage->stage == MESA_SHADER_MESH) {
out_stage->key.has_task_shader = !(sinfo->flags & VK_SHADER_CREATE_NO_TASK_SHADER_BIT_EXT);
}
diff --git a/src/util/00-radv-defaults.conf b/src/util/00-radv-defaults.conf
index 555fc3c79af..371f9c8a7ca 100644
--- a/src/util/00-radv-defaults.conf
+++ b/src/util/00-radv-defaults.conf
@@ -42,7 +42,6 @@ Application bugs worked around in this file:
-
diff --git a/src/util/driconf.h b/src/util/driconf.h
index 009e6386594..2160f7b33de 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -708,10 +708,6 @@
DRI_CONF_OPT_B(radv_disable_depth_storage, def, \
"Hides support for storage access to depth formats")
-#define DRI_CONF_RADV_DGC(def) \
- DRI_CONF_OPT_B(radv_dgc, def, \
- "Expose an experimental implementation of VK_NV_device_generated_commands on GFX8+")
-
#define DRI_CONF_RADV_FLUSH_BEFORE_QUERY_COPY(def) \
DRI_CONF_OPT_B( \
radv_flush_before_query_copy, def, \