diff --git a/.pick_status.json b/.pick_status.json index d5e17371294..f996124fc2f 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -58,7 +58,7 @@ "description": "radv: do not launch an IB2 for secondary cmdbuf with INDIRECT_MULTI on GFX7", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null }, diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 27b016de4d4..d12166d5456 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -4748,6 +4748,15 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou for (uint32_t i = 0; i < commandBufferCount; i++) { RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); + bool allow_ib2 = true; + + if (secondary->device->physical_device->rad_info.chip_class == GFX7 && + secondary->state.uses_draw_indirect_multi) { + /* Do not launch an IB2 for secondary command buffers that contain + * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hang the GPU. + */ + allow_ib2 = false; + } primary->scratch_size_per_wave_needed = MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed); @@ -4779,7 +4788,7 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou radv_emit_framebuffer_state(primary); } - primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs); + primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2); /* When the secondary command buffer is compute only we don't * need to re-emit the current graphics pipeline. @@ -5165,6 +5174,8 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool index radeon_emit(cs, count_va >> 32); radeon_emit(cs, stride); /* stride */ radeon_emit(cs, di_src_sel); + + cmd_buffer->state.uses_draw_indirect_multi = true; } } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index cb6842e30d9..55bff7ebeb6 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1385,6 +1385,9 @@ struct radv_cmd_state { enum rgp_flush_bits sqtt_flush_bits; uint8_t cb_mip[MAX_RTS]; + + /* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */ + bool uses_draw_indirect_multi; }; struct radv_cmd_pool { diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index 54b736cf9b0..e607f49aae7 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -280,7 +280,8 @@ struct radeon_winsys { void (*cs_add_buffer)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo); - void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child); + void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child, + bool allow_ib2); void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index 3f8a4ed227d..da0dbeee4cc 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -40,6 +40,11 @@ enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 }; +struct radv_amdgpu_ib { + struct radeon_winsys_bo *bo; + unsigned cdw; +}; + struct radv_amdgpu_cs { struct radeon_cmdbuf base; struct radv_amdgpu_winsys *ws; @@ -52,7 +57,7 @@ struct radv_amdgpu_cs { unsigned num_buffers; struct drm_amdgpu_bo_list_entry *handles; - struct radeon_winsys_bo **old_ib_buffers; + struct radv_amdgpu_ib *old_ib_buffers; unsigned num_old_ib_buffers; unsigned max_num_old_ib_buffers; unsigned *ib_size_ptr; @@ -154,7 +159,7 @@ radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) free(cs->base.buf); for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i) - cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i]); + cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo); for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) { free(cs->old_cs_buffers[i].buf); @@ -311,8 +316,8 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size) if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) { unsigned max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2); - struct radeon_winsys_bo **old_ib_buffers = - realloc(cs->old_ib_buffers, max_num_old_ib_buffers * sizeof(void *)); + struct radv_amdgpu_ib *old_ib_buffers = + realloc(cs->old_ib_buffers, max_num_old_ib_buffers * sizeof(*old_ib_buffers)); if (!old_ib_buffers) { cs->status = VK_ERROR_OUT_OF_HOST_MEMORY; return; @@ -321,7 +326,8 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size) cs->old_ib_buffers = old_ib_buffers; } - cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer; + cs->old_ib_buffers[cs->num_old_ib_buffers].bo = cs->ib_buffer; + cs->old_ib_buffers[cs->num_old_ib_buffers++].cdw = cs->base.cdw; cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0, radv_amdgpu_cs_domain(&cs->ws->base), @@ -332,7 +338,7 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size) if (!cs->ib_buffer) { cs->base.cdw = 0; cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY; - cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers]; + cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo; } cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer); @@ -342,7 +348,7 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size) /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */ cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY; - cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers]; + cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo; } cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer); @@ -401,7 +407,7 @@ radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs) cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer); for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i) - cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i]); + cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo); cs->num_old_ib_buffers = 0; cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va; @@ -539,10 +545,13 @@ radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_b } static void -radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child) +radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, + bool allow_ib2) { struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent); struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child); + struct radv_amdgpu_winsys *ws = parent->ws; + bool use_ib2 = ws->use_ib_bos && allow_ib2; if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS) return; @@ -556,7 +565,7 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]); } - if (parent->ws->use_ib_bos) { + if (use_ib2) { if (parent->base.cdw + 4 > parent->base.max_dw) radv_amdgpu_cs_grow(&parent->base, 4); @@ -565,57 +574,78 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm radeon_emit(&parent->base, child->ib.ib_mc_address >> 32); radeon_emit(&parent->base, child->ib.size); } else { - /* When the secondary command buffer is huge we have to copy the list of CS buffers to the - * parent to submit multiple IBs. - */ - if (child->num_old_cs_buffers > 0) { - unsigned num_cs_buffers; - uint32_t *new_buf; + if (parent->ws->use_ib_bos) { + /* Copy and chain old IB buffers from the child to the parent IB. */ + for (unsigned i = 0; i < child->num_old_ib_buffers; i++) { + struct radv_amdgpu_ib *ib = &child->old_ib_buffers[i]; + uint8_t *mapped; - /* Compute the total number of CS buffers needed. */ - num_cs_buffers = parent->num_old_cs_buffers + child->num_old_cs_buffers + 1; + if (parent->base.cdw + ib->cdw > parent->base.max_dw) + radv_amdgpu_cs_grow(&parent->base, ib->cdw); - struct radeon_cmdbuf *old_cs_buffers = - realloc(parent->old_cs_buffers, num_cs_buffers * sizeof(*parent->old_cs_buffers)); - if (!old_cs_buffers) { - parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; - parent->base.cdw = 0; - return; + mapped = ws->base.buffer_map(ib->bo); + if (!mapped) { + parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; + return; + } + + /* Copy the IB data without the original chain link. */ + memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * ib->cdw); + parent->base.cdw += ib->cdw; } - parent->old_cs_buffers = old_cs_buffers; + } else { + /* When the secondary command buffer is huge we have to copy the list of CS buffers to the + * parent to submit multiple IBs. + */ + if (child->num_old_cs_buffers > 0) { + unsigned num_cs_buffers; + uint32_t *new_buf; - /* Copy the parent CS to its list of CS buffers, so submission ordering is maintained. */ - new_buf = malloc(parent->base.max_dw * 4); - if (!new_buf) { - parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; - parent->base.cdw = 0; - return; - } - memcpy(new_buf, parent->base.buf, parent->base.max_dw * 4); + /* Compute the total number of CS buffers needed. */ + num_cs_buffers = parent->num_old_cs_buffers + child->num_old_cs_buffers + 1; - parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = parent->base.cdw; - parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = parent->base.max_dw; - parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf; - parent->num_old_cs_buffers++; + struct radeon_cmdbuf *old_cs_buffers = + realloc(parent->old_cs_buffers, num_cs_buffers * sizeof(*parent->old_cs_buffers)); + if (!old_cs_buffers) { + parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; + parent->base.cdw = 0; + return; + } + parent->old_cs_buffers = old_cs_buffers; - /* Then, copy all child CS buffers to the parent list. */ - for (unsigned i = 0; i < child->num_old_cs_buffers; i++) { - new_buf = malloc(child->old_cs_buffers[i].max_dw * 4); + /* Copy the parent CS to its list of CS buffers, so submission ordering is maintained. */ + new_buf = malloc(parent->base.max_dw * 4); if (!new_buf) { parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; parent->base.cdw = 0; return; } - memcpy(new_buf, child->old_cs_buffers[i].buf, child->old_cs_buffers[i].max_dw * 4); + memcpy(new_buf, parent->base.buf, parent->base.max_dw * 4); - parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = child->old_cs_buffers[i].cdw; - parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = child->old_cs_buffers[i].max_dw; + parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = parent->base.cdw; + parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = parent->base.max_dw; parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf; parent->num_old_cs_buffers++; - } - /* Reset the parent CS before copying the child CS into it. */ - parent->base.cdw = 0; + /* Then, copy all child CS buffers to the parent list. */ + for (unsigned i = 0; i < child->num_old_cs_buffers; i++) { + new_buf = malloc(child->old_cs_buffers[i].max_dw * 4); + if (!new_buf) { + parent->status = VK_ERROR_OUT_OF_HOST_MEMORY; + parent->base.cdw = 0; + return; + } + memcpy(new_buf, child->old_cs_buffers[i].buf, child->old_cs_buffers[i].max_dw * 4); + + parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = child->old_cs_buffers[i].cdw; + parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = child->old_cs_buffers[i].max_dw; + parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf; + parent->num_old_cs_buffers++; + } + + /* Reset the parent CS before copying the child CS into it. */ + parent->base.cdw = 0; + } } if (parent->base.cdw + child->base.cdw > parent->base.max_dw) @@ -1150,7 +1180,7 @@ radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr) struct radv_amdgpu_winsys_bo *bo; bo = (struct radv_amdgpu_winsys_bo *)(i == cs->num_old_ib_buffers ? cs->ib_buffer - : cs->old_ib_buffers[i]); + : cs->old_ib_buffers[i].bo); if (addr >= bo->base.va && addr - bo->base.va < bo->size) { if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0) return (char *)ret + (addr - bo->base.va);