diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 2f992310b08..66767d7f256 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -6351,6 +6351,22 @@ radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) } } +static void +radv_emit_streamout_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va) +{ + const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader; + const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_STREAMOUT_STATE); + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + uint32_t base_reg; + + if (loc->sgpr_idx == -1) + return; + + base_reg = last_vgt_shader->info.user_data_0; + + radv_emit_shader_pointer(device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, false); +} + static void radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) { @@ -6402,6 +6418,39 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) desc_va += so_offset; radv_emit_streamout_buffers(cmd_buffer, desc_va); + + if (pdev->info.gfx_level >= GFX12) { + const uint8_t first_target = ffs(so->enabled_mask) - 1; + unsigned state_offset; + uint64_t state_va; + void *state_ptr; + + /* The layout is: + * struct { + * struct { + * uint32_t ordered_id; // equal for all buffers + * uint32_t dwords_written; + * } buffer[4]; + * }; + * + * The buffer must be initialized to 0 and the address must be aligned to 64 + * because it's faster when the atomic doesn't straddle a 64B block boundary. + */ + if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, MAX_SO_BUFFERS * 8, 64, &state_offset, &state_ptr)) + return; + + memset(state_ptr, 0, MAX_SO_BUFFERS * 8); + + state_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + state_va += state_offset; + + /* The first enabled streamout target will contain the ordered ID/offset buffer for all + * targets. + */ + state_va += first_target * 8; + + radv_emit_streamout_state(cmd_buffer, state_va); + } } cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; @@ -7900,7 +7949,7 @@ radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_ */ cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; - if (pdev->use_ngg_streamout) { + if (pdev->use_ngg_streamout && pdev->info.gfx_level < GFX12) { /* GFX11 needs GDS OA for streamout. */ cmd_buffer->gds_oa_needed = true; } @@ -13217,6 +13266,7 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; struct radv_streamout_state *so = &cmd_buffer->state.streamout; struct radeon_cmdbuf *cs = cmd_buffer->cs; + bool first_target = true; assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); if (!pdev->use_ngg_streamout) @@ -13245,7 +13295,24 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC radv_cs_add_buffer(device->ws, cs, buffer->bo); } - if (pdev->use_ngg_streamout) { + if (pdev->info.gfx_level >= GFX12) { + /* Only the first streamout target holds information. */ + if (first_target) { + if (append) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit( + cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, (R_0309B0_GE_GS_ORDERED_ID_BASE >> 2)); + radeon_emit(cs, 0); + } else { + radeon_set_uconfig_reg(cs, R_0309B0_GE_GS_ORDERED_ID_BASE, 0); + } + + first_target = false; + } + } else if (pdev->use_ngg_streamout) { if (append) { radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, @@ -13309,6 +13376,14 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); + if (pdev->info.gfx_level >= GFX12) { + /* Nothing to do. The streamout state buffer already contains the next ordered ID, which + * is the only thing we need to restore. + */ + radv_set_streamout_enable(cmd_buffer, false); + return; + } + if (pdev->use_ngg_streamout) { /* Wait for streamout to finish before reading GDS_STRMOUT registers. */ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;