From ab3aaad9570abb21967ce7479f1a920a1f5e3616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Sat, 21 Jun 2025 08:25:09 -0300 Subject: [PATCH] vulkan: don't destroy vk_sync_timeline if a point is still pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, vk_sync_timeline_wait() may report idle while we still have time points outstanding in vk_queue. This race between vk_sync_timeline and vk_queue is problematic as vk_queue_submit_cleanup() can end up trying to unref a timeline point that was already destroyed in vk_sync_timeline_finish(), leading to a use-after-free. Address this race condition by introducing a reference counter to the timeline. Each timeline point takes a reference to the timeline when it's checked out and drops the reference when it's freed. Now, the timeline is only destroyed when all the references have been dropped. To guarantee that all references will be dropped and the timeline will be destroyed, vk_sync_timeline_finish() waits for all pending points to be checked out. This commit fixes several Mesa CI flakes in v3dv related to timeline semaphores. 
Cc: mesa-stable Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12648 Signed-off-by: Maíra Canal Reviewed-by: Faith Ekstrand Part-of: --- src/vulkan/runtime/vk_sync_timeline.c | 155 ++++++++++++++++++-------- src/vulkan/runtime/vk_sync_timeline.h | 2 + 2 files changed, 113 insertions(+), 44 deletions(-) diff --git a/src/vulkan/runtime/vk_sync_timeline.c b/src/vulkan/runtime/vk_sync_timeline.c index b21e97c846d..0d392f28418 100644 --- a/src/vulkan/runtime/vk_sync_timeline.c +++ b/src/vulkan/runtime/vk_sync_timeline.c @@ -60,6 +60,38 @@ vk_sync_timeline_type_validate(const struct vk_sync_timeline_type *ttype) assert(!(req_features & ~ttype->point_sync_type->features)); } +static void +vk_sync_timeline_state_ref(struct vk_sync_timeline_state *state) +{ + p_atomic_inc(&state->refcount); +} + +static void +vk_sync_timeline_state_unref(struct vk_device *device, + struct vk_sync_timeline_state *state) +{ + if (p_atomic_dec_return(&state->refcount)) + return; + + list_for_each_entry_safe(struct vk_sync_timeline_point, point, + &state->free_points, link) { + list_del(&point->link); + vk_sync_finish(device, &point->sync); + vk_free(&device->alloc, point); + } + + list_for_each_entry_safe(struct vk_sync_timeline_point, point, + &state->pending_points, link) { + list_del(&point->link); + vk_sync_finish(device, &point->sync); + vk_free(&device->alloc, point); + } + + u_cnd_monotonic_destroy(&state->cond); + mtx_destroy(&state->mutex); + vk_free(&device->alloc, state); +} + VkResult vk_sync_timeline_init(struct vk_device *device, struct vk_sync *sync, @@ -95,33 +127,37 @@ vk_sync_timeline_init(struct vk_device *device, list_inithead(&state->pending_points); list_inithead(&state->free_points); + p_atomic_set(&state->refcount, 1); + timeline->state = state; return VK_SUCCESS; } +static VkResult +vk_sync_timeline_gc_locked(struct vk_device *device, + struct vk_sync_timeline_state *state, + bool drain); + static void vk_sync_timeline_finish(struct vk_device *device, 
struct vk_sync *sync) { struct vk_sync_timeline_state *state = to_vk_sync_timeline_state(sync); - list_for_each_entry_safe(struct vk_sync_timeline_point, point, - &state->free_points, link) { - list_del(&point->link); - vk_sync_finish(device, &point->sync); - vk_free(&device->alloc, point); - } - list_for_each_entry_safe(struct vk_sync_timeline_point, point, - &state->pending_points, link) { - list_del(&point->link); - vk_sync_finish(device, &point->sync); - vk_free(&device->alloc, point); - } + /* We need to garbage collect to get rid of any pending points so that the + * vk_sync_timeline_state_unref() at the end drops the final reference + * held by the vk_sync_timeline. It's up to the client to ensure that + * there are no vk_sync in-flight when this is called so this should get + * rid of all pending time points. The only time point references left are + * those held by waits or vk_queue_submit(). + */ + mtx_lock(&state->mutex); + vk_sync_timeline_gc_locked(device, state, true); + assert(list_is_empty(&state->pending_points)); + mtx_unlock(&state->mutex); - u_cnd_monotonic_destroy(&state->cond); - mtx_destroy(&state->mutex); - vk_free(&device->alloc, state); + vk_sync_timeline_state_unref(device, state); } static struct vk_sync_timeline_point * @@ -137,11 +173,6 @@ vk_sync_timeline_first_point(struct vk_sync_timeline_state *state) return point; } -static VkResult -vk_sync_timeline_gc_locked(struct vk_device *device, - struct vk_sync_timeline_state *state, - bool drain); - static VkResult vk_sync_timeline_alloc_point_locked(struct vk_device *device, struct vk_sync_timeline *timeline, @@ -197,6 +228,8 @@ vk_sync_timeline_alloc_point_locked(struct vk_device *device, *point_out = point; + vk_sync_timeline_state_ref(state); + return VK_SUCCESS; } @@ -221,21 +254,66 @@ vk_sync_timeline_ref_point_locked(struct vk_sync_timeline_point *point) point->refcount++; } -static void -vk_sync_timeline_unref_point_locked(struct vk_sync_timeline_state *state, - struct 
vk_sync_timeline_point *point) +/* Returns true if this was the last reference to point. + * + * DO NOT call this helper directly. You should call vk_sync_timeline_unref_point_locked() + * or vk_sync_timeline_point_unref() instead. + */ +static bool +vk_sync_timeline_unref_point_no_unref_state_locked(struct vk_sync_timeline_point *point) { + struct vk_sync_timeline_state *state = point->timeline_state; + assert(point->refcount > 0); point->refcount--; - if (point->refcount == 0) { - /* The pending list also takes a reference so this can't be pending */ - assert(!point->pending); - list_add(&point->link, &state->free_points); - } + + if (point->refcount > 0) + return false; + + assert(point->refcount == 0); + + /* The pending list also takes a reference so this can't be pending */ + assert(!point->pending); + list_add(&point->link, &state->free_points); + + return true; } static void -vk_sync_timeline_complete_point_locked(struct vk_sync_timeline_state *state, +vk_sync_timeline_unref_point_locked(struct vk_device *device, + struct vk_sync_timeline_state *state, + struct vk_sync_timeline_point *point) +{ + /* The caller needs to have its own reference to the state, not just the + * one implicit in point, because it's also holding the lock. + */ + assert(p_atomic_read(&state->refcount) > 1); + + if (vk_sync_timeline_unref_point_no_unref_state_locked(point)) + vk_sync_timeline_state_unref(device, state); +} + +void +vk_sync_timeline_point_unref(struct vk_device *device, + struct vk_sync_timeline_point *point) +{ + struct vk_sync_timeline_state *state = point->timeline_state; + + mtx_lock(&state->mutex); + const bool last_ref = + vk_sync_timeline_unref_point_no_unref_state_locked(point); + mtx_unlock(&state->mutex); + + /* Drop the state reference outside the mutex so we don't free the state + * and then try to unlock the mutex. 
+ */ + if (last_ref) + vk_sync_timeline_state_unref(device, state); +} + +static void +vk_sync_timeline_complete_point_locked(struct vk_device *device, + struct vk_sync_timeline_state *state, struct vk_sync_timeline_point *point) { if (!point->pending) @@ -248,7 +326,7 @@ vk_sync_timeline_complete_point_locked(struct vk_sync_timeline_state *state, list_del(&point->link); /* Drop the pending reference */ - vk_sync_timeline_unref_point_locked(state, point); + vk_sync_timeline_unref_point_locked(device, state, point); } static VkResult @@ -290,7 +368,7 @@ vk_sync_timeline_gc_locked(struct vk_device *device, return result; } - vk_sync_timeline_complete_point_locked(state, point); + vk_sync_timeline_complete_point_locked(device, state, point); } return VK_SUCCESS; @@ -364,17 +442,6 @@ vk_sync_timeline_get_point(struct vk_device *device, return result; } -void -vk_sync_timeline_point_unref(struct vk_device *device, - struct vk_sync_timeline_point *point) -{ - struct vk_sync_timeline_state *state = point->timeline_state; - - mtx_lock(&state->mutex); - vk_sync_timeline_unref_point_locked(state, point); - mtx_unlock(&state->mutex); -} - static VkResult vk_sync_timeline_signal_locked(struct vk_device *device, struct vk_sync_timeline_state *state, @@ -476,13 +543,13 @@ vk_sync_timeline_wait_locked(struct vk_device *device, /* Pick the mutex back up */ mtx_lock(&state->mutex); - vk_sync_timeline_unref_point_locked(state, point); + vk_sync_timeline_unref_point_locked(device, state, point); /* This covers both VK_TIMEOUT and VK_ERROR_DEVICE_LOST */ if (result != VK_SUCCESS) return result; - vk_sync_timeline_complete_point_locked(state, point); + vk_sync_timeline_complete_point_locked(device, state, point); } return VK_SUCCESS; diff --git a/src/vulkan/runtime/vk_sync_timeline.h b/src/vulkan/runtime/vk_sync_timeline.h index 27a0df9dc8b..50642efc5cf 100644 --- a/src/vulkan/runtime/vk_sync_timeline.h +++ b/src/vulkan/runtime/vk_sync_timeline.h @@ -69,6 +69,8 @@ struct 
vk_sync_timeline_state { uint64_t highest_past; uint64_t highest_pending; + uint32_t refcount; + struct list_head pending_points; struct list_head free_points; };