pan: report async CSF group faults via context reset status
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

A long-running job can trip the Panfrost watchdog, resulting
in the job timing out. However, GROUP_SUBMIT does not report
that through the normal submission/fence path. GROUP_SUBMIT
has already returned, and the syncobj is signalled, without
checking the group state, making any "completion" seem like
a success.

Poll the Panthor group state from get_device_reset_status()
and surface fatal faults and timeouts as a context reset.

This was observed when running the OpenCL-CTS test
test_allocations image2d_read on Mali, which is a very
long-running compute job. Rusticl already checks the device
reset status when flushing events, so following this change,
the timeout propagates as an execution failure instead of
being silently ignored, and the subsequent read observing
partial writes, with the test ultimately failing at the
verification step.

Signed-off-by: Ahmed Hesham <ahmed.hesham@arm.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41995>
This commit is contained in:
Ahmed Hesham 2026-06-03 12:52:05 +00:00 committed by Marge Bot
parent 76e600a868
commit 2a0bacd012

View file

@ -740,8 +740,8 @@ update_reset_status(struct panfrost_context *ctx,
}
}
static void
csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
static enum pipe_reset_status
csf_sync_ctx_state(struct panfrost_context *ctx)
{
struct panfrost_device *dev = pan_device(ctx->base.screen);
struct drm_panthor_group_get_state state = {
@ -749,19 +749,22 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
};
int ret;
if (!ctx->csf.is_init)
return PIPE_NO_RESET;
ret = pan_kmod_ioctl(panfrost_device_fd(dev),
DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &state);
if (ret) {
update_reset_status(ctx, PIPE_UNKNOWN_CONTEXT_RESET);
mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
return;
return PIPE_UNKNOWN_CONTEXT_RESET;
}
/* Context is still usable. This was a transient error. */
if (!(state.state & (DRM_PANTHOR_GROUP_STATE_FATAL_FAULT |
DRM_PANTHOR_GROUP_STATE_TIMEDOUT))) {
update_reset_status(ctx, PIPE_NO_RESET);
return;
return PIPE_NO_RESET;
}
/* If the VM is unusable, we can't do much, as this is shared between all
@ -776,9 +779,24 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
* means we consider all resets as guilty until that point, but that
* should be fine.
*/
update_reset_status(ctx, state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT
? PIPE_INNOCENT_CONTEXT_RESET
: PIPE_GUILTY_CONTEXT_RESET);
enum pipe_reset_status reset_status =
state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT
? PIPE_INNOCENT_CONTEXT_RESET
: PIPE_GUILTY_CONTEXT_RESET;
update_reset_status(ctx, reset_status);
return reset_status;
}
static void
csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
{
enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx);
if (reset_status != PIPE_GUILTY_CONTEXT_RESET &&
reset_status != PIPE_INNOCENT_CONTEXT_RESET)
return;
mesa_loge("Group became unusable, re-initializing context");
panfrost_context_reinit(ctx);
@ -1744,7 +1762,10 @@ static enum pipe_reset_status
get_device_reset_status(struct pipe_context *pctx)
{
struct panfrost_context *ctx = pan_context(pctx);
enum pipe_reset_status reset_status = ctx->csf.reset_status;
/* Probe for an asynchronous group fault/timeout that the submit and fence
* paths don't observe, so it's reported instead of silently dropped. */
enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx);
/* Reset the status before returning. */
ctx->csf.reset_status = PIPE_NO_RESET;