From 26364c63471a714f4dc20615e8de19fcd4e7cee3 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Tue, 31 Aug 2021 14:16:32 -0700 Subject: [PATCH] iris: signal the syncobj after a failed batch When a batch fails we either recreate our context (in case we got -EIO or -ENOMEM) or we abort() (every other error). If we don't abort and a later batch has a dependency on the batch that failed, then this newer batch will fail with -EINVAL since it requires a syncobj that was never submitted, which means we'll abort(). To avoid this problem, in this patch we simply signal syncobjs of failed batches. This means we may be breaking our dependency tracking, but IMHO it's better than simply letting it abort() later. In other words, this moves the situation for some apps from "app causes a GPU hang and then aborts" to "app causes a GPU hang but keeps running". Note: on some older Kernels (like today's Debian 5.10 Kernel) I see X simply freezing after the GPU hang when the app doesn't decide to abort(). Switching to a more recent Kernel fixes this issue for me, so in case it happens to you make sure you have the most recent stable trees. v2: - Fix coding style (Ken). - Use the big comment block provided by Ken (Ken). - Adjust the commit message so avoid saying we retry (Ken). - Rebase after the syncobj ownership changes. - Drive-by add a missing white space in the header. Acked-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Signed-off-by: Paulo Zanoni Part-of: --- src/gallium/drivers/iris/iris_batch.c | 15 +++++++++++++++ src/gallium/drivers/iris/iris_fence.c | 15 +++++++++++++++ src/gallium/drivers/iris/iris_fence.h | 3 ++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c index 939e828d475..d5d18dc0d0e 100644 --- a/src/gallium/drivers/iris/iris_batch.c +++ b/src/gallium/drivers/iris/iris_batch.c @@ -858,6 +858,21 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) int ret = submit_batch(batch); + /* When batch submission fails, our end-of-batch syncobj remains + * unsignalled, and in fact is not even considered submitted. + * + * In the hang recovery case (-EIO) or -ENOMEM, we recreate our context and + * attempt to carry on. In that case, we need to signal our syncobj, + * dubiously claiming that this batch completed, because future batches may + * depend on it. If we don't, then execbuf would fail with -EINVAL for + * those batches, because they depend on a syncobj that's considered to be + * "never submitted". This would lead to an abort(). So here, we signal + * the failing batch's syncobj to try and allow further progress to be + * made, knowing we may have broken our dependency tracking. + */ + if (ret < 0) + iris_syncobj_signal(screen->bufmgr, iris_batch_get_signal_syncobj(batch)); + batch->exec_count = 0; batch->aperture_space = 0; diff --git a/src/gallium/drivers/iris/iris_fence.c b/src/gallium/drivers/iris/iris_fence.c index a7dffe58912..d9e58fe4fee 100644 --- a/src/gallium/drivers/iris/iris_fence.c +++ b/src/gallium/drivers/iris/iris_fence.c @@ -87,6 +87,21 @@ iris_syncobj_destroy(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj) free(syncobj); } +void +iris_syncobj_signal(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj) +{ + int fd = iris_bufmgr_get_fd(bufmgr); + struct drm_syncobj_array args = { + .handles = (uintptr_t)&syncobj->handle, + .count_handles = 1, + }; + + if (intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_SIGNAL, &args)) { + fprintf(stderr, "failed to signal syncobj %"PRIu32"\n", + syncobj->handle); + } +} + /** * Add a sync-point to the batch, with the given flags. * diff --git a/src/gallium/drivers/iris/iris_fence.h b/src/gallium/drivers/iris/iris_fence.h index 1332e55ec92..d28c59ce5e8 100644 --- a/src/gallium/drivers/iris/iris_fence.h +++ b/src/gallium/drivers/iris/iris_fence.h @@ -39,7 +39,8 @@ struct iris_syncobj { }; struct iris_syncobj *iris_create_syncobj(struct iris_bufmgr *bufmgr); -void iris_syncobj_destroy(struct iris_bufmgr*, struct iris_syncobj *); +void iris_syncobj_destroy(struct iris_bufmgr *, struct iris_syncobj *); +void iris_syncobj_signal(struct iris_bufmgr *, struct iris_syncobj *); void iris_batch_add_syncobj(struct iris_batch *batch, struct iris_syncobj *syncobj,