winsys/amdgpu: flatten huge if and reorder code in amdgpu_cs_submit_ib

This ensures that a failure which jumps to the cleanup label is still tracked correctly.

Reviewed-by: Mihai Preda <mhpreda@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17968>
This commit is contained in:
Marek Olšák 2022-08-09 17:27:18 -04:00 committed by Marge Bot
parent 6d6da5bf4d
commit 471c82d21e

View file

@ -1487,151 +1487,167 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
if (acs->ip_type == AMD_IP_GFX) if (acs->ip_type == AMD_IP_GFX)
ws->gfx_bo_list_counter += cs->num_real_buffers; ws->gfx_bo_list_counter += cs->num_real_buffers;
bool noop = false; struct drm_amdgpu_cs_chunk chunks[7];
unsigned num_chunks = 0;
if (acs->ctx->num_rejected_cs) { /* BO list */
r = -ECANCELED; if (!use_bo_list_create) {
} else { chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
struct drm_amdgpu_cs_chunk chunks[7]; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
unsigned num_chunks = 0; chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
num_chunks++;
}
/* BO list */ /* Fence dependencies. */
if (!use_bo_list_create) { unsigned num_dependencies = cs->fence_dependencies.num;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES; if (num_dependencies) {
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4; struct drm_amdgpu_cs_chunk_dep *dep_chunk =
chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in; alloca(num_dependencies * sizeof(*dep_chunk));
num_chunks++;
for (unsigned i = 0; i < num_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->fence_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
} }
/* Fence dependencies. */ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
unsigned num_dependencies = cs->fence_dependencies.num; chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
if (num_dependencies) { chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
struct drm_amdgpu_cs_chunk_dep *dep_chunk = num_chunks++;
alloca(num_dependencies * sizeof(*dep_chunk)); }
for (unsigned i = 0; i < num_dependencies; i++) { /* Syncobj dependencies. */
struct amdgpu_fence *fence = unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
(struct amdgpu_fence*)cs->fence_dependencies.list[i]; if (num_syncobj_dependencies) {
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
assert(util_queue_fence_is_signalled(&fence->submitted)); for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]); struct amdgpu_fence *fence =
} (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES; if (!amdgpu_fence_is_syncobj(fence))
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies; continue;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++; assert(util_queue_fence_is_signalled(&fence->submitted));
sem_chunk[i].handle = fence->syncobj;
} }
/* Syncobj dependencies. */ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num; chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
if (num_syncobj_dependencies) { chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
struct drm_amdgpu_cs_chunk_sem *sem_chunk = num_chunks++;
alloca(num_syncobj_dependencies * sizeof(sem_chunk[0])); }
for (unsigned i = 0; i < num_syncobj_dependencies; i++) { /* Syncobj signals. */
struct amdgpu_fence *fence = unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
(struct amdgpu_fence*)cs->syncobj_dependencies.list[i]; if (num_syncobj_to_signal) {
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
if (!amdgpu_fence_is_syncobj(fence)) for (unsigned i = 0; i < num_syncobj_to_signal; i++) {
continue; struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted)); assert(amdgpu_fence_is_syncobj(fence));
sem_chunk[i].handle = fence->syncobj; sem_chunk[i].handle = fence->syncobj;
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
num_chunks++;
} }
/* Syncobj signals. */ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num; chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4
if (num_syncobj_to_signal) { * num_syncobj_to_signal;
struct drm_amdgpu_cs_chunk_sem *sem_chunk = chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
alloca(num_syncobj_to_signal * sizeof(sem_chunk[0])); num_chunks++;
}
for (unsigned i = 0; i < num_syncobj_to_signal; i++) { /* Fence */
struct amdgpu_fence *fence = if (has_user_fence) {
(struct amdgpu_fence*)cs->syncobj_to_signal.list[i]; chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
num_chunks++;
}
assert(amdgpu_fence_is_syncobj(fence)); /* IB */
sem_chunk[i].handle = fence->syncobj; if (cs->ib[IB_PREAMBLE].ib_bytes) {
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4
* num_syncobj_to_signal;
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
num_chunks++;
}
/* Fence */
if (has_user_fence) {
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
num_chunks++;
}
/* IB */
if (cs->ib[IB_PREAMBLE].ib_bytes) {
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE];
num_chunks++;
}
/* IB */
cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN]; chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE];
num_chunks++; num_chunks++;
}
if (cs->secure) { /* IB */
cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE; cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE; chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
} else { chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE; chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE; num_chunks++;
}
/* Apply RADEON_NOOP. */ if (cs->secure) {
if (acs->noop) { cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
if (acs->ip_type == AMD_IP_GFX) { cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
/* Reduce the IB size and fill it with NOP to make it like an empty IB. */ } else {
unsigned noop_size = MIN2(cs->ib[IB_MAIN].ib_bytes, ws->info.ib_alignment); cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
}
cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_size / 4 - 2, 0); bool noop = acs->noop;
cs->ib[IB_MAIN].ib_bytes = noop_size;
} else {
noop = true;
}
}
assert(num_chunks <= ARRAY_SIZE(chunks)); if (noop && acs->ip_type == AMD_IP_GFX) {
/* Reduce the IB size and fill it with NOP to make it like an empty IB. */
unsigned noop_size = MIN2(cs->ib[IB_MAIN].ib_bytes, ws->info.ib_alignment);
cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_size / 4 - 2, 0);
cs->ib[IB_MAIN].ib_bytes = noop_size;
noop = false;
}
assert(num_chunks <= ARRAY_SIZE(chunks));
if (unlikely(acs->ctx->num_rejected_cs)) {
r = -ECANCELED;
} else if (unlikely(noop)) {
r = 0; r = 0;
} else {
/* Submit the command buffer.
*
* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
* quite often, but it eventually succeeds after enough attempts. This happens frequently
* with dEQP using NGG streamout.
*/
do {
/* Wait 1 ms and try again. */
if (r == -ENOMEM)
os_time_sleep(1000);
if (!noop) { r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
/* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites num_chunks, chunks, &seq_no);
* quite often, but it eventually succeeds after enough attempts. This happens frequently } while (r == -ENOMEM);
* with dEQP using NGG streamout.
if (!r) {
/* Success. */
uint64_t *user_fence = NULL;
/* Need to reserve 4 QWORD for user fence:
* QWORD[0]: completed fence
* QWORD[1]: preempted fence
* QWORD[2]: reset fence
* QWORD[3]: preempted then reset
*/ */
do { if (has_user_fence)
/* Wait 1 ms and try again. */ user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
if (r == -ENOMEM) amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
os_time_sleep(1000);
r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, &seq_no);
} while (r == -ENOMEM);
} }
} }
if (r) { /* Cleanup. */
if (bo_list)
amdgpu_bo_list_destroy_raw(ws->dev, bo_list);
cleanup:
if (unlikely(r)) {
if (!acs->allow_context_lost) { if (!acs->allow_context_lost) {
/* Non-robust contexts are allowed to terminate the process. The only alternative is /* Non-robust contexts are allowed to terminate the process. The only alternative is
* to skip command submission, which would look like a freeze because nothing is drawn, * to skip command submission, which would look like a freeze because nothing is drawn,
@ -1642,34 +1658,11 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
exit(1); exit(1);
} }
if (r == -ECANCELED) fprintf(stderr, "amdgpu: The CS has been rejected (%i). Recreate the context.\n", r);
fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n");
else
fprintf(stderr, "amdgpu: The CS has been rejected, "
"see dmesg for more information (%i).\n", r);
acs->ctx->num_rejected_cs++; acs->ctx->num_rejected_cs++;
ws->num_total_rejected_cs++; ws->num_total_rejected_cs++;
} else if (!noop) {
/* Success. */
uint64_t *user_fence = NULL;
/* Need to reserve 4 QWORD for user fence:
* QWORD[0]: completed fence
* QWORD[1]: preempted fence
* QWORD[2]: reset fence
* QWORD[3]: preempted then reset
**/
if (has_user_fence)
user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
} }
/* Cleanup. */
if (bo_list)
amdgpu_bo_list_destroy_raw(ws->dev, bo_list);
cleanup:
/* If there was an error, signal the fence, because it won't be signalled /* If there was an error, signal the fence, because it won't be signalled
* by the hardware. */ * by the hardware. */
if (r || noop) if (r || noop)