iris: Try to recover from GPU hangs.

The iris batch module now tries to detect that the kernel has banned
our GEM context, creates a new non-banned context, and informs the
iris context module that all assumptions about state are now invalid
and it needs to reinitialize the relevant state.

Based on Chris Wilson's work, but significantly rewritten by me.
This commit is contained in:
Kenneth Graunke 2019-05-07 23:19:30 -07:00
parent 7402564c07
commit c5c12bdd00
3 changed files with 71 additions and 0 deletions

View file

@ -451,6 +451,28 @@ iris_finish_batch(struct iris_batch *batch)
batch->primary_batch_size = iris_batch_bytes_used(batch);
}
/**
 * Swap the batch's GEM context for a freshly cloned one.
 *
 * Intended for the case where the kernel has banned the current context.
 * On success the old context is destroyed, the clone is installed in the
 * batch, and the owning iris_context is told that its GPU state is gone.
 *
 * \return true if a replacement context was installed; false if cloning
 *         failed, in which case the batch keeps its existing context.
 */
static bool
replace_hw_ctx(struct iris_batch *batch)
{
   struct iris_screen *scr = batch->screen;
   struct iris_bufmgr *mgr = scr->bufmgr;

   /* A zero ID from the clone call is treated as failure. */
   uint32_t fresh_ctx_id = iris_clone_hw_context(mgr, batch->hw_ctx_id);
   if (fresh_ctx_id == 0)
      return false;

   /* Only drop the old context once we know we have a substitute. */
   iris_destroy_hw_context(mgr, batch->hw_ctx_id);
   batch->hw_ctx_id = fresh_ctx_id;

   /* Everything the context believed about GPU state is now stale. */
   iris_lost_context_state(batch);

   return true;
}
/**
* Submit the batch to the GPU via execbuffer2.
*/
@ -583,6 +605,15 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
/* Start a new batch buffer. */
iris_batch_reset(batch);
/* EIO means our context is banned. In this case, try and replace it
* with a new logical context, and inform iris_context that all state
* has been lost and needs to be re-initialized. If this succeeds,
* dubiously claim success...
*/
if (ret == -EIO && replace_hw_ctx(batch)) {
ret = 0;
}
if (ret >= 0) {
//if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
//iris_check_for_reset(ice);

View file

@ -63,6 +63,44 @@ iris_set_debug_callback(struct pipe_context *ctx,
memset(&ice->dbg, 0, sizeof(ice->dbg));
}
/**
 * Batch-module hook invoked after a GPU hang has been detected.
 *
 * The GEM context backing this batch is gone, so nothing previously
 * programmed on the GPU can be trusted.  Re-run the initial context setup
 * for the affected batch, flag every piece of state as dirty, and forget
 * the cached "last grid" information.
 */
void
iris_lost_context_state(struct iris_batch *batch)
{
   /* Batches deliberately carry no pointer back to their iris_context, to
    * keep the layering clean.  Here we have no choice but to reach the
    * owner, so recover it from the batch's known slot in ice->batches[].
    *
    * Note: "ice" doubles as the sample pointer handed to container_of(),
    * which is why it is declared (and NULL-initialized) up front.
    */
   struct iris_context *ice = NULL;
   struct iris_screen *screen;

   switch (batch->name) {
   case IRIS_BATCH_RENDER:
      ice = container_of(batch, ice, batches[IRIS_BATCH_RENDER]);
      assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
      screen = (void *) ice->ctx.screen;

      /* Re-emit the render batch's initial context setup. */
      ice->vtbl.init_render_context(screen, batch, &ice->vtbl, &ice->dbg);
      break;

   case IRIS_BATCH_COMPUTE:
      ice = container_of(batch, ice, batches[IRIS_BATCH_COMPUTE]);
      assert(&ice->batches[IRIS_BATCH_COMPUTE] == batch);
      screen = (void *) ice->ctx.screen;

      /* Re-emit the compute batch's initial context setup. */
      ice->vtbl.init_compute_context(screen, batch, &ice->vtbl, &ice->dbg);
      break;

   default:
      unreachable("unhandled batch reset");
   }

   /* Set every dirty bit and zero the remembered last_grid contents. */
   ice->state.dirty = ~0ull;
   memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
}
static void
iris_get_sample_position(struct pipe_context *ctx,
unsigned sample_count,

View file

@ -662,6 +662,8 @@ double get_time(void);
struct pipe_context *
iris_create_context(struct pipe_screen *screen, void *priv, unsigned flags);
/* Called by the batch module after it swaps in a new GEM context: re-runs
 * the render or compute context setup for that batch and marks all state
 * dirty.
 */
void iris_lost_context_state(struct iris_batch *batch);
void iris_init_blit_functions(struct pipe_context *ctx);
void iris_init_clear_functions(struct pipe_context *ctx);
void iris_init_program_functions(struct pipe_context *ctx);