From 2a0bacd01203a394f70ea9b5652372ebdefa512a Mon Sep 17 00:00:00 2001 From: Ahmed Hesham Date: Wed, 3 Jun 2026 12:52:05 +0000 Subject: [PATCH] pan: report async CSF group faults via context reset status A long-running job can trip the Panfrost watchdog, resulting in the job timing out. However, the timeout is not reported through the normal submission/fence path: by then GROUP_SUBMIT has already returned, and the syncobj is signalled without checking the group state, making any "completion" seem like a success. Poll the Panthor group state from get_device_reset_status() and surface fatal faults and timeouts as a context reset. This was observed when running the OpenCL-CTS test test_allocations image2d_read on Mali, which is a very long-running compute job. Rusticl already checks the device reset status when flushing events, so with this change the timeout propagates as an execution failure. Previously it was silently ignored: the subsequent read observed partial writes, and the test ultimately failed at the verification step. 
Signed-off-by: Ahmed Hesham Reviewed-by: Boris Brezillon Part-of: --- src/gallium/drivers/panfrost/pan_csf.c | 37 ++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index dfa5c81af31..f643b20c6cd 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -740,8 +740,8 @@ update_reset_status(struct panfrost_context *ctx, } } -static void -csf_check_ctx_state_and_reinit(struct panfrost_context *ctx) +static enum pipe_reset_status +csf_sync_ctx_state(struct panfrost_context *ctx) { struct panfrost_device *dev = pan_device(ctx->base.screen); struct drm_panthor_group_get_state state = { @@ -749,19 +749,22 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx) }; int ret; + if (!ctx->csf.is_init) + return PIPE_NO_RESET; + ret = pan_kmod_ioctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &state); if (ret) { update_reset_status(ctx, PIPE_UNKNOWN_CONTEXT_RESET); mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno); - return; + return PIPE_UNKNOWN_CONTEXT_RESET; } /* Context is still usable. This was a transient error. */ if (!(state.state & (DRM_PANTHOR_GROUP_STATE_FATAL_FAULT | DRM_PANTHOR_GROUP_STATE_TIMEDOUT))) { update_reset_status(ctx, PIPE_NO_RESET); - return; + return PIPE_NO_RESET; } /* If the VM is unusable, we can't do much, as this is shared between all @@ -776,9 +779,24 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx) * means we consider all resets as guilty until that point, but that * should be fine. */ - update_reset_status(ctx, state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT - ? PIPE_INNOCENT_CONTEXT_RESET - : PIPE_GUILTY_CONTEXT_RESET); + enum pipe_reset_status reset_status = + state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT + ? 
PIPE_INNOCENT_CONTEXT_RESET + : PIPE_GUILTY_CONTEXT_RESET; + + update_reset_status(ctx, reset_status); + + return reset_status; +} + +static void +csf_check_ctx_state_and_reinit(struct panfrost_context *ctx) +{ + enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx); + + if (reset_status != PIPE_GUILTY_CONTEXT_RESET && + reset_status != PIPE_INNOCENT_CONTEXT_RESET) + return; mesa_loge("Group became unusable, re-initializing context"); panfrost_context_reinit(ctx); @@ -1744,7 +1762,10 @@ static enum pipe_reset_status get_device_reset_status(struct pipe_context *pctx) { struct panfrost_context *ctx = pan_context(pctx); - enum pipe_reset_status reset_status = ctx->csf.reset_status; + + /* Probe for an asynchronous group fault/timeout that the submit and fence + * paths don't observe, so it's reported instead of silently dropped. */ + enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx); /* Reset the status before returning. */ ctx->csf.reset_status = PIPE_NO_RESET;