diff --git a/src/amd/ci/radv-skips.txt b/src/amd/ci/radv-skips.txt
index 2cbb447d268..cf1ad45d4e8 100644
--- a/src/amd/ci/radv-skips.txt
+++ b/src/amd/ci/radv-skips.txt
@@ -53,8 +53,5 @@ dEQP-VK.reconvergence.maximal.compute.nesting4.7.38
 # investigated.
 dEQP-VK.api.command_buffers.many_indirect_disps_on_secondary
 
-# RADV bug
-dEQP-VK.dgc.ext.misc.properties.maxIndirectSequenceCount
-
 # CTS bug
 dEQP-VK.api.debug_utils.long_labels_video_decode
diff --git a/src/amd/vulkan/radv_dgc.c b/src/amd/vulkan/radv_dgc.c
index db02355c112..beea7c184ca 100644
--- a/src/amd/vulkan/radv_dgc.c
+++ b/src/amd/vulkan/radv_dgc.c
@@ -21,6 +21,11 @@
 #define PKT3_INDIRECT_BUFFER_BYTES 16
 #define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
 
+/* Arbitrary limit on the maximum number of sequences per IB to simplify the DGC IB chaining
+ * implementation, which is already quite complicated.
+ */
+#define MAX_SEQUENCES_PER_IB 65536
+
 /* The DGC command buffer layout is quite complex, here's some explanations:
  *
  * Without the DGC preamble, the default layout looks like:
@@ -56,6 +61,9 @@
  * - on GFX, the driver uses the IB2 packet which the easiest solution
  * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the
  *   trailer
+ *
+ * When the number of sequences is greater than the arbitrary limit defined above, IB chaining
+ * is used between sequences (i.e. sequence #0 chains to sequence #1, etc.).
  */
 
 uint32_t
@@ -96,9 +104,27 @@ radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type
    return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
 }
 
+static bool
+radv_dgc_use_ib_chaining(uint32_t sequences_count)
+{
+   /* Use IB chaining when the number of sequences is too high because the maximum number of DWORDS
+    * per IB is limited by the hw. This limit is arbitrary and could be tuned more optimally, though
+    * in practice most applications use <= 4K sequences and won't be affected by this.
+    *
+    * This IB chaining mechanism is implemented by emitting an IB packet at the end of each sequence
+    * to chain to the next one, except for the last sequence, which emits the "jump to trailer" on
+    * the compute queue or only padding on the graphics queue.
+    */
+   return sequences_count >= MAX_SEQUENCES_PER_IB;
+}
+
 static bool
 radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
 {
+   /* Use a preamble with IB chaining to determine the cmdbuf execution size dynamically. */
+   if (radv_dgc_use_ib_chaining(pGeneratedCommandsInfo->maxSequenceCount))
+      return true;
+
    /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
     * a bit of a guess as it depends on the actual count which we don't know. */
    return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64;
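For intuition on the limit above: an IB's execution size is carried as a DWORD count in its INDIRECT_BUFFER packet, so a single monolithic DGC command buffer cannot grow without bound. A minimal host-side sketch of the trade-off, assuming an illustrative stride and IB budget (neither value comes from this patch, and needs_ib_chaining() is a hypothetical helper, not RADV code):

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch: at e.g. a 256-byte stride, 65536 sequences already amount to a
     * 16 MiB command buffer; chaining instead caps each IB at a single stride. */
    static bool
    needs_ib_chaining(uint32_t sequences_count, uint32_t stride_bytes, uint64_t ib_budget_bytes)
    {
       return (uint64_t)sequences_count * stride_bytes > ib_budget_bytes;
    }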
@@ -441,6 +467,7 @@ radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const
 
 struct dgc_cmdbuf_layout {
    bool use_preamble;
+   bool use_ib_chaining;
    uint32_t alloc_size;
 
    uint32_t main_trailer_offset;
@@ -472,6 +499,16 @@ get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indire
    radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride,
                           &layout->upload_stride);
 
+   layout->use_ib_chaining = radv_dgc_use_ib_chaining(sequences_count);
+   if (layout->use_ib_chaining) {
+      layout->main_cmd_stride = radv_align_cmdbuf(
+         device, radv_pad_cmdbuf(device, layout->main_cmd_stride + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX), AMD_IP_GFX);
+
+      layout->ace_cmd_stride = radv_align_cmdbuf(
+         device, radv_pad_cmdbuf(device, layout->ace_cmd_stride + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE),
+         AMD_IP_COMPUTE);
+   }
+
    layout->use_preamble = use_preamble;
    if (layout->use_preamble) {
       layout->main_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
@@ -626,6 +663,7 @@ struct radv_dgc_params {
 
    uint8_t queue_family;
    uint8_t use_preamble;
+   uint8_t use_ib_chaining;
 
    uint64_t params_addr;
 
@@ -1091,7 +1129,7 @@ build_dgc_buffer_trailer_ace(struct dgc_cmdbuf *cs)
 }
 
 static void
-build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_size,
+build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_exec_size,
                           nir_def *cmd_buf_main_offset, unsigned preamble_size)
 {
    nir_builder *b = cs->b;
@@ -1103,7 +1141,7 @@ build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offse
    {
       nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_preamble_offset));
 
-      nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
+      nir_def *words = nir_ushr_imm(b, cmd_buf_exec_size, 2);
 
       const uint32_t pad_size = preamble_size - PKT3_INDIRECT_BUFFER_BYTES;
 
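The stride adjustment in get_dgc_cmdbuf_layout() above reserves room for one 16-byte chain packet per sequence, then re-pads and re-aligns the stride. A minimal sketch of that arithmetic, assuming a power-of-two alignment (chained_stride() is a hypothetical helper, not RADV's radv_pad_cmdbuf()/radv_align_cmdbuf()):

    #include <stdint.h>

    /* Each sequence's stride grows by PKT3_INDIRECT_BUFFER_BYTES (16) so the
     * chain packet always lands at a fixed offset within the sequence. */
    static uint32_t
    chained_stride(uint32_t cmd_stride, uint32_t ib_alignment /* power of two */)
    {
       uint32_t stride = cmd_stride + 16; /* PKT3_INDIRECT_BUFFER_BYTES */
       return (stride + ib_alignment - 1) & ~(ib_alignment - 1); /* align up */
    }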
@@ -1114,6 +1152,30 @@ build_dgc_buffer_preamble(struct dgc_cmdbuf *cs, nir_def *cmd_buf_preamble_offse
    nir_pop_if(b, NULL);
 }
 
+static nir_def *
+dgc_cmd_buf_exec_size(struct dgc_cmdbuf *cs, nir_def *sequence_count, bool is_ace)
+{
+   const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
+   const struct radv_device *device = cs->dev;
+   nir_builder *b = cs->b;
+
+   nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
+   unsigned tail_size = radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
+
+   nir_def *cmd_buf_stride = is_ace ? load_param32(b, ace_cmd_buf_stride) : load_param32(b, cmd_buf_stride);
+
+   /* When IB chaining is used, the cmdbuf execution size is either:
+    * - the size of the tail (i.e. padding + "jump to trailer" IB on the compute queue) when the
+    *   number of sequences is 0.
+    * - the size of one stride (i.e. sequence + padding + "chain to next sequence" IB).
+    *
+    * Otherwise, it's the whole cmdbuf size.
+    */
+   return nir_bcsel(b, use_ib_chaining,
+                    nir_bcsel(b, nir_ieq_imm(b, sequence_count, 0), nir_imm_int(b, tail_size), cmd_buf_stride),
+                    dgc_cmd_buf_size(b, sequence_count, is_ace, device));
+}
+
 static void
 build_dgc_buffer_preamble_main(struct dgc_cmdbuf *cs, nir_def *sequence_count)
 {
@@ -1122,10 +1184,10 @@ build_dgc_buffer_preamble_main(struct dgc_cmdbuf *cs, nir_def *sequence_count)
 
    nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
    nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
-   nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
+   nir_def *cmd_buf_exec_size = dgc_cmd_buf_exec_size(cs, sequence_count, false);
    unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
 
-   build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size);
+   build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_exec_size, cmd_buf_main_offset, preamble_size);
 }
 
 static void
@@ -1136,10 +1198,10 @@ build_dgc_buffer_preamble_ace(struct dgc_cmdbuf *cs, nir_def *sequence_count)
 
    nir_def *cmd_buf_preamble_offset = load_param32(b, ace_cmd_buf_preamble_offset);
    nir_def *cmd_buf_main_offset = load_param32(b, ace_cmd_buf_main_offset);
-   nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
+   nir_def *cmd_buf_exec_size = dgc_cmd_buf_exec_size(cs, sequence_count, true);
    unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
 
-   build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size);
+   build_dgc_buffer_preamble(cs, cmd_buf_preamble_offset, cmd_buf_exec_size, cmd_buf_main_offset, preamble_size);
 }
 
 /**
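The nested nir_bcsel() in dgc_cmd_buf_exec_size() above is easier to follow in scalar form. A sketch of the equivalent selection logic (names are illustrative; the real selection happens inside the generated NIR shader):

    #include <stdbool.h>
    #include <stdint.h>

    /* With chaining, the preamble only needs to execute the first stride, since
     * each sequence chains to the next (or just the tail when the count is 0).
     * Without chaining, the whole generated cmdbuf is executed; total_size
     * stands in for dgc_cmd_buf_size(). */
    static uint32_t
    exec_size(bool use_ib_chaining, uint32_t sequence_count, uint32_t tail_size,
              uint32_t stride, uint32_t total_size)
    {
       if (!use_ib_chaining)
          return total_size;
       return sequence_count == 0 ? tail_size : stride;
    }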
@@ -2507,7 +2569,60 @@ dgc_pad_cmdbuf(struct dgc_cmdbuf *cs, nir_def *cmd_buf_end)
 }
 
 static void
-dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct radv_indirect_command_layout *layout)
+dgc_emit_ib_chaining(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count, nir_def *cmd_buf_stride,
+                     nir_def *cmd_buf_end, bool is_ace)
+{
+   nir_builder *b = cs->b;
+   nir_def *cmd_buf_offset = is_ace ? load_param32(b, ace_cmd_buf_main_offset) : load_param32(b, cmd_buf_main_offset);
+
+   nir_def *is_last_sequence = nir_ieq(b, nir_iadd_imm(b, sequence_id, 1), sequence_count);
+
+   /* Determine if the "jump to trailer" IB needs to be emitted. */
+   nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace);
+
+   /* Leave space for emitting an indirect packet if necessary. */
+   cmd_buf_end = nir_bcsel(b, nir_ior(b, nir_inot(b, is_last_sequence), is_compute_queue),
+                           nir_iadd_imm(b, cmd_buf_end, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_end);
+
+   dgc_pad_cmdbuf(cs, cmd_buf_end);
+
+   /* Emit an IB to chain to the next sequence, or jump to the trailer for the last one (compute queue only). */
+   nir_push_if(b, is_last_sequence);
+   {
+      nir_push_if(b, is_compute_queue);
+      {
+         nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_end));
+
+         nir_def *cmd_buf_trailer_offset;
+         unsigned trailer_size;
+
+         if (is_ace) {
+            cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
+            trailer_size = radv_dgc_trailer_cmdbuf_size(cs->dev, AMD_IP_COMPUTE) / 4;
+         } else {
+            cmd_buf_trailer_offset = nir_imm_int(b, 0);
+            trailer_size = radv_dgc_trailer_cmdbuf_size(cs->dev, AMD_IP_GFX) / 4;
+         }
+
+         dgc_emit_indirect_buffer(cs, va, cmd_buf_trailer_offset, nir_imm_int(b, trailer_size));
+      }
+      nir_pop_if(b, NULL);
+   }
+   nir_push_else(b, NULL);
+   {
+      nir_def *va = nir_iadd(b, cs->va, nir_u2u64(b, cmd_buf_end));
+
+      nir_def *ib_offset = nir_iadd(b, cmd_buf_offset, nir_imul(b, nir_iadd_imm(b, sequence_id, 1), cmd_buf_stride));
+      nir_def *ib_cdw = nir_udiv_imm(b, cmd_buf_stride, 4);
+
+      dgc_emit_indirect_buffer(cs, va, ib_offset, ib_cdw);
+   }
+   nir_pop_if(b, NULL);
+}
+
+static void
+dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count,
+                           struct radv_indirect_command_layout *layout)
 {
    nir_builder *b = cs->b;
 
@@ -2591,12 +2706,22 @@ dgc_emit_one_sequence_main(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct r
       }
    }
 
-   /* Pad the cmdbuffer if we did not use the whole stride */
-   dgc_pad_cmdbuf(cs, cmd_buf_end);
+   nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
+   nir_push_if(b, use_ib_chaining);
+   {
+      dgc_emit_ib_chaining(cs, sequence_id, sequence_count, cmd_buf_stride, cmd_buf_end, false);
+   }
+   nir_push_else(b, NULL);
+   {
+      /* Pad the cmdbuffer if we did not use the whole stride */
+      dgc_pad_cmdbuf(cs, cmd_buf_end);
+   }
+   nir_pop_if(b, NULL);
 }
 
 static void
-dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct radv_indirect_command_layout *layout)
+dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *sequence_count,
+                          struct radv_indirect_command_layout *layout)
 {
    nir_builder *b = cs->b;
 
@@ -2633,8 +2758,17 @@ dgc_emit_one_sequence_ace(struct dgc_cmdbuf *cs, nir_def *sequence_id, struct ra
       dgc_emit_draw_mesh_tasks_ace(cs, stream_addr);
    }
 
-   /* Pad the cmdbuffer if we did not use the whole stride */
-   dgc_pad_cmdbuf(cs, cmd_buf_end);
+   nir_def *use_ib_chaining = nir_ine_imm(b, load_param8(b, use_ib_chaining), 0);
+   nir_push_if(b, use_ib_chaining);
+   {
+      dgc_emit_ib_chaining(cs, sequence_id, sequence_count, ace_cmd_buf_stride, cmd_buf_end, true);
+   }
+   nir_push_else(b, NULL);
+   {
+      /* Pad the cmdbuffer if we did not use the whole stride */
+      dgc_pad_cmdbuf(cs, cmd_buf_end);
+   }
+   nir_pop_if(b, NULL);
 }
 
 static nir_shader *
@@ -2695,7 +2829,7 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
 
       nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
       {
-         dgc_emit_one_sequence_main(&cmd_buf, sequence_id, layout);
+         dgc_emit_one_sequence_main(&cmd_buf, sequence_id, sequence_count, layout);
       }
       nir_pop_if(&b, NULL);
 
@@ -2709,7 +2843,7 @@ build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_l
 
      nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
      {
-         dgc_emit_one_sequence_ace(&cmd_buf, sequence_id, layout);
+         dgc_emit_one_sequence_ace(&cmd_buf, sequence_id, sequence_count, layout);
     }
      nir_pop_if(&b, NULL);
 
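dgc_emit_indirect_buffer() itself is not part of this diff; for orientation, the 16-byte (PKT3_INDIRECT_BUFFER_BYTES) packet it writes at the end of each sequence follows the usual 4-DWORD PM4 INDIRECT_BUFFER shape, sketched below with placeholder field semantics (the exact header opcode and control bits vary per GPU generation and are not taken from this patch):

    #include <stdint.h>

    /* Illustrative 4-DWORD PM4 INDIRECT_BUFFER packet used to chain to the
     * next sequence (or to jump to the trailer). */
    struct pm4_indirect_buffer {
       uint32_t header;     /* PKT3 header with the INDIRECT_BUFFER opcode */
       uint32_t ib_base_lo; /* target VA, low 32 bits */
       uint32_t ib_base_hi; /* target VA, high bits */
       uint32_t control;    /* execution size in DWORDs + chain/valid flags */
    };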
@@ -3026,6 +3160,7 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn
       .upload_stride = cmdbuf_layout.upload_stride,
       .sequence_count = sequences_count,
       .use_preamble = use_preamble,
+      .use_ib_chaining = cmdbuf_layout.use_ib_chaining,
       .stream_addr = pGeneratedCommandsInfo->indirectAddress,
       .sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress,
       .ies_addr = ies ? ies->va : 0,