render: replace fatal aborts on GPU reset with EGL context recovery

Hyprland currently triggers RASSERT (SIGABRT) on three GPU-reset
signals: glGetGraphicsResetStatus() reporting a reset in begin() /
beginSimple(), glGetError() returning GL_CONTEXT_LOST at end(), and
renderTextureInternal() receiving an invalid texture. The original
abort message acknowledged this as a gap: "Cannot continue until proper
GPU reset handling is implemented."

On recent Intel hardware (Xe driver on Meteor Lake/Arc graphics), TLB
invalidation timeouts cause the kernel to issue recoverable GPU resets
and re-arm the rendering engines without losing the EGL display. The
compositor is expected to re-establish its context and carry on. The
current RASSERT takes down the entire user session on what is, from
the kernel's perspective, a handled fault — and these resets can cluster
during suspend/resume or under memory pressure.

This patch adds a minimal recovery mechanism:

- attemptContextReset() unbinds and rebinds the EGL context via
  eglMakeCurrent(EGL_NO_CONTEXT) then eglMakeCurrent(m_eglContext), and
  marks shaders for reinitialization on the next frame.

- begin() and beginSimple() first check the m_gpuResetCooldown counter
  (a single counter on the GL renderer, not tracked per monitor); if it
  is > 0, they decrement it and skip the frame. Otherwise, if
  glGetGraphicsResetStatus() reports a reset, they log the reason,
  call attemptContextReset(), and either skip one frame (on success)
  or set a 60-frame cooldown (on failure) to avoid tight-loop recovery
  attempts. pMonitor is reset so the skipped frame doesn't hold a
  dangling reference.

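- end() no longer aborts on GL_CONTEXT_LOST; it logs and sets the
  60-frame cooldown. begin()/beginSimple() skip frames until the
  cooldown expires and then re-check the reset status, which is where
  recovery is actually attempted.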

- renderTextureInternal() replaces the RASSERT-on-invalid-texture with
  a logged skip. Post-reset textures are often invalid until reuploaded;
  skipping the draw for one frame is preferable to SIGABRT.

The existing RASSERT(pMonitor, "...without begin()!") is kept — that's
a programmer error, not a recoverable state.

Tested on a Framework 13 / Intel Xe setup that was previously crashing
on GPU resets; with this patch the compositor logs the reset, skips a
handful of frames, and resumes rendering normally. Combined with the
prior two commits (surface and blur null-guards), this covers the
follow-on nulls that surface during the recovery window.
This commit is contained in:
Sjoerd Siebinga 2026-04-14 15:14:31 +02:00
parent 15357c9685
commit 0603a354f3
2 changed files with 55 additions and 6 deletions

View file

@ -663,9 +663,28 @@ EGLImageKHR CHyprOpenGLImpl::createEGLImage(const Aquamarine::SDMABUFAttrs& attr
return image;
}
bool CHyprOpenGLImpl::attemptContextReset() {
Log::logger->log(Log::ERR, "GPU reset: attempting EGL context recovery...");
eglMakeCurrent(m_eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
if (eglMakeCurrent(m_eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, m_eglContext) != EGL_TRUE) {
Log::logger->log(Log::ERR, "GPU reset: eglMakeCurrent failed, context may be lost");
return false;
}
m_shadersInitialized = false;
Log::logger->log(Log::WARN, "GPU reset: EGL context re-established, shaders will reinitialize on next frame.");
return true;
}
void CHyprOpenGLImpl::beginSimple(PHLMONITOR pMonitor, const CRegion& damage, SP<IRenderbuffer> rb, SP<IFramebuffer> fb) {
g_pHyprRenderer->m_renderData.pMonitor = pMonitor;
if (m_gpuResetCooldown > 0) {
m_gpuResetCooldown--;
Log::logger->log(Log::WARN, "GPU reset recovery cooldown, skipping frame ({} remaining)", m_gpuResetCooldown);
g_pHyprRenderer->m_renderData.pMonitor.reset();
return;
}
const GLenum RESETSTATUS = glGetGraphicsResetStatus();
if (RESETSTATUS != GL_NO_ERROR) {
std::string errStr = "";
@ -675,7 +694,14 @@ void CHyprOpenGLImpl::beginSimple(PHLMONITOR pMonitor, const CRegion& damage, SP
case GL_UNKNOWN_CONTEXT_RESET: errStr = "GL_UNKNOWN_CONTEXT_RESET"; break;
default: errStr = "UNKNOWN??"; break;
}
RASSERT(false, "Aborting, glGetGraphicsResetStatus returned {}. Cannot continue until proper GPU reset handling is implemented.", errStr);
Log::logger->log(Log::ERR, "GPU reset detected in beginSimple: {}. Attempting EGL context recovery.", errStr);
if (!attemptContextReset()) {
Log::logger->log(Log::ERR, "GPU reset recovery failed. Skipping frames for cooldown.");
m_gpuResetCooldown = 60;
} else {
Log::logger->log(Log::WARN, "GPU reset recovery succeeded. Skipping current frame to reinitialize.");
}
g_pHyprRenderer->m_renderData.pMonitor.reset();
return;
}
@ -714,6 +740,13 @@ void CHyprOpenGLImpl::makeEGLCurrent() {
void CHyprOpenGLImpl::begin(PHLMONITOR pMonitor, const CRegion& damage_, SP<IFramebuffer> fb, std::optional<CRegion> finalDamage) {
g_pHyprRenderer->m_renderData.pMonitor = pMonitor;
if (m_gpuResetCooldown > 0) {
m_gpuResetCooldown--;
Log::logger->log(Log::WARN, "GPU reset recovery cooldown, skipping frame ({} remaining)", m_gpuResetCooldown);
g_pHyprRenderer->m_renderData.pMonitor.reset();
return;
}
const GLenum RESETSTATUS = glGetGraphicsResetStatus();
if (RESETSTATUS != GL_NO_ERROR) {
std::string errStr = "";
@ -723,7 +756,14 @@ void CHyprOpenGLImpl::begin(PHLMONITOR pMonitor, const CRegion& damage_, SP<IFra
case GL_UNKNOWN_CONTEXT_RESET: errStr = "GL_UNKNOWN_CONTEXT_RESET"; break;
default: errStr = "UNKNOWN??"; break;
}
RASSERT(false, "Aborting, glGetGraphicsResetStatus returned {}. Cannot continue until proper GPU reset handling is implemented.", errStr);
Log::logger->log(Log::ERR, "GPU reset detected in begin: {}. Attempting EGL context recovery.", errStr);
if (!attemptContextReset()) {
Log::logger->log(Log::ERR, "GPU reset recovery failed. Skipping frames for cooldown.");
m_gpuResetCooldown = 60;
} else {
Log::logger->log(Log::WARN, "GPU reset recovery succeeded. Skipping current frame to reinitialize.");
}
g_pHyprRenderer->m_renderData.pMonitor.reset();
return;
}
@ -856,8 +896,10 @@ void CHyprOpenGLImpl::end() {
// check for gl errors
const GLenum ERR = glGetError();
if UNLIKELY (ERR == GL_CONTEXT_LOST) /* We don't have infra to recover from this */
RASSERT(false, "glGetError at Opengl::end() returned GL_CONTEXT_LOST. Cannot continue until proper GPU reset handling is implemented.");
if UNLIKELY (ERR == GL_CONTEXT_LOST) {
Log::logger->log(Log::ERR, "glGetError at Opengl::end() returned GL_CONTEXT_LOST. Recovery will trigger on next begin().");
m_gpuResetCooldown = 60;
}
}
}
@ -1443,8 +1485,13 @@ WP<CShader> CHyprOpenGLImpl::renderToFBInternal(SP<ITexture> tex, const STexture
void CHyprOpenGLImpl::renderTextureInternal(SP<ITexture> tex, const CBox& box, const STextureRenderData& data) {
RASSERT(g_pHyprRenderer->m_renderData.pMonitor, "Tried to render texture without begin()!");
RASSERT(tex, "Attempted to draw nullptr texture!");
RASSERT(tex->ok(), "Attempted to draw invalid texture!");
if UNLIKELY (!tex || !tex->ok()) {
// After a GPU reset, textures become invalid. Skip the draw
// instead of aborting — recovery will happen on the next begin().
Log::logger->log(Log::ERR, "renderTextureInternal: invalid texture (likely GPU reset). Skipping draw.");
return;
}
TRACY_GPU_ZONE("RenderTextureInternalWithDamage");

View file

@ -313,6 +313,8 @@ namespace Render::GL {
bool m_applyFinalShader = false;
bool m_blend = false;
bool m_offloadedFramebuffer = false;
int m_gpuResetCooldown = 0;
bool attemptContextReset();
bool m_cmSupported = true;
SP<CShader> m_finalScreenShader;