From 2a0bacd01203a394f70ea9b5652372ebdefa512a Mon Sep 17 00:00:00 2001
From: Ahmed Hesham <ahmed.hesham@arm.com>
Date: Wed, 3 Jun 2026 12:52:05 +0000
Subject: [PATCH] pan: report async CSF group faults via context reset status

A long-running job can trip the Panfrost watchdog, resulting
in the job timing out. However, the timeout is not reported
through the normal submission/fence path: by the time it
fires, GROUP_SUBMIT has already returned, and the syncobj is
signalled without checking the group state, making any
"completion" look like a success.

Poll the Panthor group state from get_device_reset_status()
and surface fatal faults and timeouts as a context reset.

This was observed when running the OpenCL-CTS test
test_allocations image2d_read on Mali, which is a very
long-running compute job. Previously, the timeout was
silently ignored: the subsequent read observed partial
writes, and the test ultimately failed at the verification
step. Rusticl already checks the device reset status when
flushing events, so with this change the timeout propagates
as an execution failure instead.

Signed-off-by: Ahmed Hesham <ahmed.hesham@arm.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41995>
---
 src/gallium/drivers/panfrost/pan_csf.c | 37 ++++++++++++++++++++------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c
index dfa5c81af31..f643b20c6cd 100644
--- a/src/gallium/drivers/panfrost/pan_csf.c
+++ b/src/gallium/drivers/panfrost/pan_csf.c
@@ -740,8 +740,8 @@ update_reset_status(struct panfrost_context *ctx,
    }
 }
 
-static void
-csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
+static enum pipe_reset_status
+csf_sync_ctx_state(struct panfrost_context *ctx)
 {
    struct panfrost_device *dev = pan_device(ctx->base.screen);
    struct drm_panthor_group_get_state state = {
@@ -749,19 +749,22 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
    };
    int ret;
 
+   if (!ctx->csf.is_init)
+      return PIPE_NO_RESET;
+
    ret = pan_kmod_ioctl(panfrost_device_fd(dev),
                         DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &state);
    if (ret) {
       update_reset_status(ctx, PIPE_UNKNOWN_CONTEXT_RESET);
       mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
-      return;
+      return PIPE_UNKNOWN_CONTEXT_RESET;
    }
 
    /* Context is still usable. This was a transient error. */
    if (!(state.state & (DRM_PANTHOR_GROUP_STATE_FATAL_FAULT |
                         DRM_PANTHOR_GROUP_STATE_TIMEDOUT))) {
       update_reset_status(ctx, PIPE_NO_RESET);
-      return;
+      return PIPE_NO_RESET;
    }
 
    /* If the VM is unusable, we can't do much, as this is shared between all
@@ -776,9 +779,24 @@ csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
     * means we consider all resets as guilty until that point, but that
     * should be fine.
     */
-   update_reset_status(ctx, state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT
-                               ? PIPE_INNOCENT_CONTEXT_RESET
-                               : PIPE_GUILTY_CONTEXT_RESET);
+   enum pipe_reset_status reset_status =
+      state.state & DRM_PANTHOR_GROUP_STATE_INNOCENT
+         ? PIPE_INNOCENT_CONTEXT_RESET
+         : PIPE_GUILTY_CONTEXT_RESET;
+
+   update_reset_status(ctx, reset_status);
+
+   return reset_status;
+}
+
+static void
+csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
+{
+   enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx);
+
+   if (reset_status != PIPE_GUILTY_CONTEXT_RESET &&
+       reset_status != PIPE_INNOCENT_CONTEXT_RESET)
+      return;
 
    mesa_loge("Group became unusable, re-initializing context");
    panfrost_context_reinit(ctx);
@@ -1744,7 +1762,10 @@ static enum pipe_reset_status
 get_device_reset_status(struct pipe_context *pctx)
 {
    struct panfrost_context *ctx = pan_context(pctx);
-   enum pipe_reset_status reset_status = ctx->csf.reset_status;
+
+   /* Probe for an asynchronous group fault/timeout that the submit and fence
+    * paths don't observe, so it's reported instead of silently dropped. */
+   enum pipe_reset_status reset_status = csf_sync_ctx_state(ctx);
 
    /* Reset the status before returning. */
    ctx->csf.reset_status = PIPE_NO_RESET;