iris: Try to recover from GPU hangs.

The iris batch module now tries to detect that the kernel has banned our GEM context, creates a new non-banned context, and informs the iris context module that all assumptions about state are now invalid and it needs to reinitialize the relevant state. Based on Chris Wilson's work, but significantly rewritten by me.
2026-01-03 13:40:11 +01:00 · 2019-05-07 23:19:30 -07:00 · 2019-05-07 23:19:30 -07:00 · c5c12bdd00
commit c5c12bdd00
parent 7402564c07
3 changed files with 71 additions and 0 deletions
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -451,6 +451,28 @@ iris_finish_batch(struct iris_batch *batch)
      batch->primary_batch_size = iris_batch_bytes_used(batch);
 }

+/**
+ * Replace our (possibly banned) GEM context with a clone; false if that fails.
+ */
+static bool
+replace_hw_ctx(struct iris_batch *batch)
+{
+   struct iris_screen *screen = batch->screen;
+   struct iris_bufmgr *bufmgr = screen->bufmgr;
+
+   uint32_t new_ctx = iris_clone_hw_context(bufmgr, batch->hw_ctx_id);
+   if (!new_ctx)
+      return false;
+
+   iris_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+   batch->hw_ctx_id = new_ctx;
+
+   /* Tell iris_context that all prior GPU state assumptions are invalid. */
+   iris_lost_context_state(batch);
+
+   return true;
+}
+
 /**
 * Submit the batch to the GPU via execbuffer2.
 */
@@ -583,6 +605,15 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
   /* Start a new batch buffer. */
   iris_batch_reset(batch);

+   /* EIO means our context is banned.  In this case, try and replace it
+    * with a new logical context, and inform iris_context that all state
+    * has been lost and needs to be re-initialized.  If this succeeds,
+    * dubiously claim success...
+    */
+   if (ret == -EIO && replace_hw_ctx(batch)) {
+      ret = 0;
+   }
+
   if (ret >= 0) {
      //if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
         //iris_check_for_reset(ice);
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -63,6 +63,44 @@ iris_set_debug_callback(struct pipe_context *ctx,
      memset(&ice->dbg, 0, sizeof(ice->dbg));
 }

+/**
+ * Called from the batch module after it has replaced a lost GEM context.
+ *
+ * Nothing previously programmed on the GPU can be trusted any more, so we
+ * re-run the render/compute context setup hook for this batch, flag all
+ * state dirty, and zero the saved last_grid.
+ */
+void
+iris_lost_context_state(struct iris_batch *batch)
+{
+   /* The batch module has no iris_context pointer (to avoid layering
+    * violations), but here we must tell the context about the batch
+    * catastrophe.  The batch is an element of some context's batches[]
+    * array, so hackily recover the owner with container_of().
+    */
+   struct iris_context *ice = NULL;
+   struct iris_screen *screen;
+
+   if (batch->name == IRIS_BATCH_RENDER) {
+      ice = container_of(batch, ice, batches[IRIS_BATCH_RENDER]);
+      assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
+      screen = (void *) ice->ctx.screen;
+
+      ice->vtbl.init_render_context(screen, batch, &ice->vtbl, &ice->dbg);
+   } else if (batch->name == IRIS_BATCH_COMPUTE) {
+      ice = container_of(batch, ice, batches[IRIS_BATCH_COMPUTE]);
+      assert(&ice->batches[IRIS_BATCH_COMPUTE] == batch);
+      screen = (void *) ice->ctx.screen;
+
+      ice->vtbl.init_compute_context(screen, batch, &ice->vtbl, &ice->dbg);
+   } else {
+      unreachable("unhandled batch reset");
+   }
+
+   ice->state.dirty = ~0ull;
+   memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+}
+
 static void
 iris_get_sample_position(struct pipe_context *ctx,
                         unsigned sample_count,
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -662,6 +662,8 @@ double get_time(void);
 struct pipe_context *
 iris_create_context(struct pipe_screen *screen, void *priv, unsigned flags);

+void iris_lost_context_state(struct iris_batch *batch);
+
 void iris_init_blit_functions(struct pipe_context *ctx);
 void iris_init_clear_functions(struct pipe_context *ctx);
 void iris_init_program_functions(struct pipe_context *ctx);