radv: implement DGC IB chaining when the number of sequences is too high

The maximum number of DWORDS per IB is limited by the hardware. So,
when the number of sequences is too high, it would just hang.

The solution here is to implement IB chaining inside the DGC cmdbuf
itself, so that a sequence chains the next one basically.

In practice, games only use up to 4K sequences and they aren't affected
by this change.

This fixes dEQP-VK.dgc.ext.misc.properties.maxIndirectSequenceCount.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13536
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36062>
This commit is contained in:
Samuel Pitoiset 2025-07-08 11:41:39 +02:00 committed by Marge Bot
parent 14399da3c6
commit 00a6e284c8
2 changed files with 149 additions and 17 deletions

View file

@ -53,8 +53,5 @@ dEQP-VK.reconvergence.maximal.compute.nesting4.7.38
# investigated.
dEQP-VK.api.command_buffers.many_indirect_disps_on_secondary
# RADV bug
dEQP-VK.dgc.ext.misc.properties.maxIndirectSequenceCount
# CTS bug
dEQP-VK.api.debug_utils.long_labels_video_decode

View file

@ -21,6 +21,11 @@
#define PKT3_INDIRECT_BUFFER_BYTES 16
#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
/* Arbitrary limit of the maximum number of sequences per IB to simplify the DGC IB chaining
* implementation which is already quite complicated.
*/
#define MAX_SEQUENCES_PER_IB 65536
/* The DGC command buffer layout is quite complex, here's some explanations:
*
* Without the DGC preamble, the default layout looks like:
@ -56,6 +61,9 @@
* - on GFX, the driver uses the IB2 packet which the easiest solution
* - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the
* trailer
*
* In case the number of sequences is greater than the arbitrary limit defined above, IB chaining
* is used between sequences (ie. sequence #0 chains to sequence #1 etc).
*/
uint32_t
@ -96,9 +104,27 @@ radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type
return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
}
static bool
radv_dgc_use_ib_chaining(uint32_t sequences_count)
{
/* Use IB chaining when the number of sequences is too high because the maximum number of DWORDS
* per IB is limited by the hw. This limit is arbitrary and could be more optimal. Though in
* practice, most applications use <= 4K sequences and won't be affected by this.
*
* This IB chaining mechanism is implemented by emitting a IB packet at the end of each sequence
* to chain the next one. Except for the last sequence which emits the "jump to trailer" when on
* compute queue or only padding when on graphics queue.
*/
return sequences_count >= MAX_SEQUENCES_PER_IB;
}
static bool
radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
/* Use a preamble with IB chaining to determine the cmdbuf execution size dynamically. */
if (radv_dgc_use_ib_chaining(pGeneratedCommandsInfo->maxSequenceCount))
return true;
/* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
* a bit of a guess as it depends on the actual count which we don't know. */
return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64;
@ -441,6 +467,7 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const
struct dgc_cmdbuf_layout {
bool use_preamble;
bool use_ib_chaining;
uint32_t alloc_size;
uint32_t main_trailer_offset;
@ -472,6 +499,16 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride, &layout->upload_stride);
layout->use_ib_chaining = radv_dgc_use_ib_chaining(sequences_count);
if (layout->use_ib_chaining) {
layout->main_cmd_stride = radv_align_cmdbuf(
device, radv_pad_cmdbuf(device, layout->main_cmd_stride + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX), AMD_IP_GFX);
layout->ace_cmd_stride = radv_align_cmdbuf(
device, radv_pad_cmdbuf(device, layout->ace_cmd_stride + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE),
AMD_IP_COMPUTE);
}
layout->use_preamble = use_preamble;
if (layout->use_preamble) {
layout->main_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
@ -626,6 +663,7 @@ struct radv_dgc_params {
uint8_t queue_family;
uint8_t use_preamble;
uint8_t use_ib_chaining;
uint64_t params_addr;
@ -1091,7 +1129,7 @@ build_dgc_buffer_trailer_ace(struct dgc_cmdbuf *cs)
}
static void
build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_size,
build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_exec_size,
nir_def *cmd_buf_main_offset, unsigned preamble_size)
{
nir_builder *b = cs->b;
@ -1103,7 +1141,7 @@ build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offse
{
nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_preamble_offset));
nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
nir_def *words = nir_ushr_imm(b, cmd_buf_exec_size, 2);
const uint32_t pad_size = preamble_size - PKT3_INDIRECT_BUFFER_BYTES;
@ -1114,6 +1152,30 @@ build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offse
nir_pop_if(b, NULL);
}
static nir_def *
dgc_cmd_buf_exec_size(struct dgc_cmdbuf *cs, nir_def *sequence_count, bool is_ace)
{
const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
const struct radv_device *device = cs->dev;
nir_builder *b = cs->b;
nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
unsigned tail_size = radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
nir_def *cmd_buf_stride = is_ace ? load_param32(b, ace_cmd_buf_stride) : load_param32(b, cmd_buf_stride);
/* When IB chaining is used, the cmdbuf execution size is either:
* - the size of the tail (ie. padding + "jump to trailer" IB on compute queue) when the number
* of sequences is 0.
* - the size of one stride (ie. sequence + padding + "chain to next sequence" IB).
*
* Otherwise, it's the whole cmdbuf size.
*/
return nir_bcsel(b, use_ib_chaining,
nir_bcsel(b, nir_ieq_imm(b, sequence_count, 0), nir_imm_int(b, tail_size), cmd_buf_stride),
dgc_cmd_buf_size(b, sequence_count, is_ace, device));
}
static void
build_dgc_buffer_preamble_main(struct dgc_cmdbuf *cs, nir_def *sequence_count)
{
@ -1122,10 +1184,10 @@ build_dgc_buffer_preamble_main(struct dgc_cmdbuf *cs, nir_def *sequence_count)
nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
nir_def *cmd_buf_exec_size = dgc_cmd_buf_exec_size(cs, sequence_count, false);
unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size);
build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_exec_size, cmd_buf_main_offset, preamble_size);
}
static void
@ -1136,10 +1198,10 @@ build_dgc_buffer_preamble_ace(struct dgc_cmdbuf *cs, nir_def *sequence_count)
nir_def *cmd_buf_preamble_offset = load_param32(b, ace_cmd_buf_preamble_offset);
nir_def *cmd_buf_main_offset = load_param32(b, ace_cmd_buf_main_offset);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
nir_def *cmd_buf_exec_size = dgc_cmd_buf_exec_size(cs, sequence_count, true);
unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size);
build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_exec_size, cmd_buf_main_offset, preamble_size);
}
/**
@ -2507,7 +2569,60 @@ dgc_pad_cmdbuf(struct dgc_cmdbuf *cs, nir_def *cmd_buf_end)
}
static void
dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct radv_indirect_command_layout *layout)
dgc_emit_ib_chaining(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count, nir_def *cmd_buf_stride,
nir_def *cmd_buf_end, bool is_ace)
{
nir_builder *b = cs->b;
nir_def *cmd_buf_offset = is_ace ? load_param32(b, ace_cmd_buf_main_offset) : load_param32(b, cmd_buf_main_offset);
nir_def *is_last_sequence = nir_ieq(b, nir_iadd_imm(b, sequence_id, 1), sequence_count);
/* Determine if the "jump to trailer" IB needs to be emitted. */
nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace);
/* Leave space for emitting an indirect packet if necessary. */
cmd_buf_end = nir_bcsel(b, nir_ior(b, nir_inot(b, is_last_sequence), is_compute_queue),
nir_iadd_imm(b, cmd_buf_end, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_end);
dgc_pad_cmdbuf(cs, cmd_buf_end);
/* Emit an IB to chain the next sequence. */
nir_push_if(b, is_last_sequence);
{
nir_push_if(b, is_compute_queue);
{
nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_end));
nir_def *cmd_buf_trailer_offset;
unsigned trailer_size;
if (is_ace) {
cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
trailer_size = radv_dgc_trailer_cmdbuf_size(cs->dev, AMD_IP_COMPUTE) / 4;
} else {
cmd_buf_trailer_offset = nir_imm_int(b, 0);
trailer_size = radv_dgc_trailer_cmdbuf_size(cs->dev, AMD_IP_GFX) / 4;
}
dgc_emit_indirect_buffer(cs, va, cmd_buf_trailer_offset, nir_imm_int(b, trailer_size));
}
nir_pop_if(b, NULL);
}
nir_push_else(b, NULL);
{
nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_end));
nir_def *ib_offset = nir_iadd(b, cmd_buf_offset, nir_imul(b, nir_iadd_imm(b, sequence_id, 1), cmd_buf_stride));
nir_def *ib_cdw = nir_udiv_imm(b, cmd_buf_stride, 4);
dgc_emit_indirect_buffer(cs, va, ib_offset, ib_cdw);
}
nir_pop_if(b, NULL);
}
static void
dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count,
struct radv_indirect_command_layout *layout)
{
nir_builder *b = cs->b;
@ -2591,12 +2706,22 @@ dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct r
}
}
/* Pad the cmdbuffer if we did not use the whole stride */
dgc_pad_cmdbuf(cs, cmd_buf_end);
nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
nir_push_if(b, use_ib_chaining);
{
dgc_emit_ib_chaining(cs, sequence_id, sequence_count, cmd_buf_stride, cmd_buf_end, false);
}
nir_push_else(b, NULL);
{
/* Pad the cmdbuffer if we did not use the whole stride */
dgc_pad_cmdbuf(cs, cmd_buf_end);
}
nir_pop_if(b, NULL);
}
static void
dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct radv_indirect_command_layout *layout)
dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count,
struct radv_indirect_command_layout *layout)
{
nir_builder *b = cs->b;
@ -2633,8 +2758,17 @@ dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct ra
dgc_emit_draw_mesh_tasks_ace(cs, stream_addr);
}
/* Pad the cmdbuffer if we did not use the whole stride */
dgc_pad_cmdbuf(cs, cmd_buf_end);
nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
nir_push_if(b, use_ib_chaining);
{
dgc_emit_ib_chaining(cs, sequence_id, sequence_count, ace_cmd_buf_stride, cmd_buf_end, true);
}
nir_push_else(b, NULL);
{
/* Pad the cmdbuffer if we did not use the whole stride */
dgc_pad_cmdbuf(cs, cmd_buf_end);
}
nir_pop_if(b, NULL);
}
static nir_shader *
@ -2695,7 +2829,7 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
{
dgc_emit_one_sequence_main(&cmd_buf, sequence_id, layout);
dgc_emit_one_sequence_main(&cmd_buf, sequence_id, sequence_count, layout);
}
nir_pop_if(&b, NULL);
@ -2709,7 +2843,7 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
{
dgc_emit_one_sequence_ace(&cmd_buf, sequence_id, layout);
dgc_emit_one_sequence_ace(&cmd_buf, sequence_id, sequence_count, layout);
}
nir_pop_if(&b, NULL);
@ -3026,6 +3160,7 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
.upload_stride = cmdbuf_layout.upload_stride,
.sequence_count = sequences_count,
.use_preamble = use_preamble,
.use_ib_chaining = cmdbuf_layout.use_ib_chaining,
.stream_addr = pGeneratedCommandsInfo->indirectAddress,
.sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress,
.ies_addr = ies ? ies->va : 0,