radv: do not launch an IB2 for secondary cmdbuf with INDIRECT_MULTI on GFX7

It's illegal to emit DRAW_{INDEX}_INDIRECT_MULTI from an IB2 on GFX7.

PAL also applies this workaround to indirect dispatches and on
GFX8-9, but that doesn't seem to be needed.

This fixes various GPU hangs on Bonaire (GFX7).

Cc: 21.1 mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11214>
(cherry picked from commit a234840e60)
This commit is contained in:
Samuel Pitoiset 2021-06-07 10:14:01 +02:00 committed by Eric Engestrom
parent 6084c73575
commit 9ce9cbecad
5 changed files with 95 additions and 50 deletions

View file

@ -58,7 +58,7 @@
"description": "radv: do not launch an IB2 for secondary cmdbuf with INDIRECT_MULTI on GFX7",
"nominated": true,
"nomination_type": 0,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null
},

View file

@ -4748,6 +4748,15 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
for (uint32_t i = 0; i < commandBufferCount; i++) {
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
bool allow_ib2 = true;
if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
secondary->state.uses_draw_indirect_multi) {
/* Do not launch an IB2 for secondary command buffers that contain
* DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hang the GPU.
*/
allow_ib2 = false;
}
primary->scratch_size_per_wave_needed =
MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
@ -4779,7 +4788,7 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
radv_emit_framebuffer_state(primary);
}
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
/* When the secondary command buffer is compute only we don't
* need to re-emit the current graphics pipeline.
@ -5165,6 +5174,8 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool index
radeon_emit(cs, count_va >> 32);
radeon_emit(cs, stride); /* stride */
radeon_emit(cs, di_src_sel);
cmd_buffer->state.uses_draw_indirect_multi = true;
}
}

View file

@ -1385,6 +1385,9 @@ struct radv_cmd_state {
enum rgp_flush_bits sqtt_flush_bits;
uint8_t cb_mip[MAX_RTS];
/* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */
bool uses_draw_indirect_multi;
};
struct radv_cmd_pool {

View file

@ -280,7 +280,8 @@ struct radeon_winsys {
void (*cs_add_buffer)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo);
void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child);
void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child,
bool allow_ib2);
void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count);

View file

@ -40,6 +40,11 @@
enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
struct radv_amdgpu_ib {
struct radeon_winsys_bo *bo;
unsigned cdw;
};
struct radv_amdgpu_cs {
struct radeon_cmdbuf base;
struct radv_amdgpu_winsys *ws;
@ -52,7 +57,7 @@ struct radv_amdgpu_cs {
unsigned num_buffers;
struct drm_amdgpu_bo_list_entry *handles;
struct radeon_winsys_bo **old_ib_buffers;
struct radv_amdgpu_ib *old_ib_buffers;
unsigned num_old_ib_buffers;
unsigned max_num_old_ib_buffers;
unsigned *ib_size_ptr;
@ -154,7 +159,7 @@ radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
free(cs->base.buf);
for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i]);
cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo);
for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
free(cs->old_cs_buffers[i].buf);
@ -311,8 +316,8 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
if (cs->num_old_ib_buffers == cs->max_num_old_ib_buffers) {
unsigned max_num_old_ib_buffers = MAX2(1, cs->max_num_old_ib_buffers * 2);
struct radeon_winsys_bo **old_ib_buffers =
realloc(cs->old_ib_buffers, max_num_old_ib_buffers * sizeof(void *));
struct radv_amdgpu_ib *old_ib_buffers =
realloc(cs->old_ib_buffers, max_num_old_ib_buffers * sizeof(*old_ib_buffers));
if (!old_ib_buffers) {
cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
return;
@ -321,7 +326,8 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
cs->old_ib_buffers = old_ib_buffers;
}
cs->old_ib_buffers[cs->num_old_ib_buffers++] = cs->ib_buffer;
cs->old_ib_buffers[cs->num_old_ib_buffers].bo = cs->ib_buffer;
cs->old_ib_buffers[cs->num_old_ib_buffers++].cdw = cs->base.cdw;
cs->ib_buffer =
cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0, radv_amdgpu_cs_domain(&cs->ws->base),
@ -332,7 +338,7 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
if (!cs->ib_buffer) {
cs->base.cdw = 0;
cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo;
}
cs->ib_mapped = cs->ws->base.buffer_map(cs->ib_buffer);
@ -342,7 +348,7 @@ radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
/* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers].bo;
}
cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
@ -401,7 +407,7 @@ radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i]);
cs->ws->base.buffer_destroy(&cs->ws->base, cs->old_ib_buffers[i].bo);
cs->num_old_ib_buffers = 0;
cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
@ -539,10 +545,13 @@ radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_b
}
static void
radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child)
radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child,
bool allow_ib2)
{
struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
struct radv_amdgpu_winsys *ws = parent->ws;
bool use_ib2 = ws->use_ib_bos && allow_ib2;
if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
return;
@ -556,7 +565,7 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm
radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
}
if (parent->ws->use_ib_bos) {
if (use_ib2) {
if (parent->base.cdw + 4 > parent->base.max_dw)
radv_amdgpu_cs_grow(&parent->base, 4);
@ -565,57 +574,78 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm
radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
radeon_emit(&parent->base, child->ib.size);
} else {
/* When the secondary command buffer is huge we have to copy the list of CS buffers to the
* parent to submit multiple IBs.
*/
if (child->num_old_cs_buffers > 0) {
unsigned num_cs_buffers;
uint32_t *new_buf;
if (parent->ws->use_ib_bos) {
/* Copy and chain old IB buffers from the child to the parent IB. */
for (unsigned i = 0; i < child->num_old_ib_buffers; i++) {
struct radv_amdgpu_ib *ib = &child->old_ib_buffers[i];
uint8_t *mapped;
/* Compute the total number of CS buffers needed. */
num_cs_buffers = parent->num_old_cs_buffers + child->num_old_cs_buffers + 1;
if (parent->base.cdw + ib->cdw > parent->base.max_dw)
radv_amdgpu_cs_grow(&parent->base, ib->cdw);
struct radeon_cmdbuf *old_cs_buffers =
realloc(parent->old_cs_buffers, num_cs_buffers * sizeof(*parent->old_cs_buffers));
if (!old_cs_buffers) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
parent->base.cdw = 0;
return;
mapped = ws->base.buffer_map(ib->bo);
if (!mapped) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
return;
}
/* Copy the IB data without the original chain link. */
memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * ib->cdw);
parent->base.cdw += ib->cdw;
}
parent->old_cs_buffers = old_cs_buffers;
} else {
/* When the secondary command buffer is huge we have to copy the list of CS buffers to the
* parent to submit multiple IBs.
*/
if (child->num_old_cs_buffers > 0) {
unsigned num_cs_buffers;
uint32_t *new_buf;
/* Copy the parent CS to its list of CS buffers, so submission ordering is maintained. */
new_buf = malloc(parent->base.max_dw * 4);
if (!new_buf) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
parent->base.cdw = 0;
return;
}
memcpy(new_buf, parent->base.buf, parent->base.max_dw * 4);
/* Compute the total number of CS buffers needed. */
num_cs_buffers = parent->num_old_cs_buffers + child->num_old_cs_buffers + 1;
parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = parent->base.cdw;
parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = parent->base.max_dw;
parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf;
parent->num_old_cs_buffers++;
struct radeon_cmdbuf *old_cs_buffers =
realloc(parent->old_cs_buffers, num_cs_buffers * sizeof(*parent->old_cs_buffers));
if (!old_cs_buffers) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
parent->base.cdw = 0;
return;
}
parent->old_cs_buffers = old_cs_buffers;
/* Then, copy all child CS buffers to the parent list. */
for (unsigned i = 0; i < child->num_old_cs_buffers; i++) {
new_buf = malloc(child->old_cs_buffers[i].max_dw * 4);
/* Copy the parent CS to its list of CS buffers, so submission ordering is maintained. */
new_buf = malloc(parent->base.max_dw * 4);
if (!new_buf) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
parent->base.cdw = 0;
return;
}
memcpy(new_buf, child->old_cs_buffers[i].buf, child->old_cs_buffers[i].max_dw * 4);
memcpy(new_buf, parent->base.buf, parent->base.max_dw * 4);
parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = child->old_cs_buffers[i].cdw;
parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = child->old_cs_buffers[i].max_dw;
parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = parent->base.cdw;
parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = parent->base.max_dw;
parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf;
parent->num_old_cs_buffers++;
}
/* Reset the parent CS before copying the child CS into it. */
parent->base.cdw = 0;
/* Then, copy all child CS buffers to the parent list. */
for (unsigned i = 0; i < child->num_old_cs_buffers; i++) {
new_buf = malloc(child->old_cs_buffers[i].max_dw * 4);
if (!new_buf) {
parent->status = VK_ERROR_OUT_OF_HOST_MEMORY;
parent->base.cdw = 0;
return;
}
memcpy(new_buf, child->old_cs_buffers[i].buf, child->old_cs_buffers[i].max_dw * 4);
parent->old_cs_buffers[parent->num_old_cs_buffers].cdw = child->old_cs_buffers[i].cdw;
parent->old_cs_buffers[parent->num_old_cs_buffers].max_dw = child->old_cs_buffers[i].max_dw;
parent->old_cs_buffers[parent->num_old_cs_buffers].buf = new_buf;
parent->num_old_cs_buffers++;
}
/* Reset the parent CS before copying the child CS into it. */
parent->base.cdw = 0;
}
}
if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
@ -1150,7 +1180,7 @@ radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr)
struct radv_amdgpu_winsys_bo *bo;
bo = (struct radv_amdgpu_winsys_bo *)(i == cs->num_old_ib_buffers ? cs->ib_buffer
: cs->old_ib_buffers[i]);
: cs->old_ib_buffers[i].bo);
if (addr >= bo->base.va && addr - bo->base.va < bo->size) {
if (amdgpu_bo_cpu_map(bo->bo, &ret) == 0)
return (char *)ret + (addr - bo->base.va);