venus: refactor to add vn_watchdog

Summary:
- cleanup redundant report_period_us check post 1.0 release
- add vn_watchdog and its accessors
  - vn_watchdog_init
  - vn_watchdog_fini
  - vn_watchdog_acquire
  - vn_watchdog_release
  - vn_watchdog_timeout

Signed-off-by: Yiwei Zhang <zzyiwei@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26179>
This commit is contained in:
Yiwei Zhang 2023-11-06 11:24:40 -08:00 committed by Marge Bot
parent d8b059b01b
commit f6adc60822
4 changed files with 100 additions and 75 deletions

View file

@ -130,53 +130,54 @@ vn_extension_get_spec_version(const char *name)
return index >= 0 ? vn_info_extension_get(index)->spec_version : 0;
}
/* Returns true when the watchdog has observed an expired ring ALIVE
 * status, i.e. liveness was not reported within the last check period.
 * Non-watchdog waiting threads call this to poll the shared verdict.
 */
static inline bool
vn_watchdog_timeout(const struct vn_watchdog *watchdog)
{
   /* "alive" is an atomic flag maintained by the acquiring thread */
   return watchdog->alive == false;
}
/* Gives up the watchdog role if the calling thread currently holds it;
 * a no-op for any other thread. Counterpart of vn_watchdog_acquire.
 */
static inline void
vn_watchdog_release(struct vn_watchdog *watchdog)
{
   const pid_t tid = syscall(SYS_gettid);
   if (tid != watchdog->tid)
      return;

   /* clear ownership before unlocking so another waiter may take over */
   watchdog->tid = 0;
   mtx_unlock(&watchdog->mutex);
}
static bool
vn_ring_monitor_acquire(struct vn_ring *ring)
vn_watchdog_acquire(struct vn_watchdog *watchdog, bool alive)
{
pid_t tid = syscall(SYS_gettid);
if (!ring->instance->ring.monitor.threadid &&
tid != ring->instance->ring.monitor.threadid &&
mtx_trylock(&ring->instance->ring.monitor.mutex) == thrd_success) {
if (!watchdog->tid && tid != watchdog->tid &&
mtx_trylock(&watchdog->mutex) == thrd_success) {
/* register as the only waiting thread that monitors the ring. */
ring->instance->ring.monitor.threadid = tid;
watchdog->tid = tid;
}
return tid == ring->instance->ring.monitor.threadid;
if (tid != watchdog->tid)
return false;
watchdog->alive = alive;
return true;
}
void
vn_ring_monitor_release(struct vn_ring *ring)
vn_relax_fini(struct vn_relax_state *state)
{
if (syscall(SYS_gettid) != ring->instance->ring.monitor.threadid)
return;
ring->instance->ring.monitor.threadid = 0;
mtx_unlock(&ring->instance->ring.monitor.mutex);
vn_watchdog_release(state->watchdog);
}
struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason)
{
if (ring->instance->ring.monitor.report_period_us) {
#ifndef NDEBUG
/* ensure minimum check period is greater than maximum renderer
* reporting period (with margin of safety to ensure no false
* positives).
*
* first_warn_time is pre-calculated based on parameters in vn_relax
* and must update together.
*/
const uint32_t first_warn_time = 3481600;
const uint32_t safety_margin = 250000;
assert(first_warn_time - safety_margin >=
ring->instance->ring.monitor.report_period_us);
#endif
if (vn_ring_monitor_acquire(ring))
vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
}
struct vn_watchdog *watchdog = &ring->instance->ring.watchdog;
if (vn_watchdog_acquire(watchdog, true))
vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
return (struct vn_relax_state){
.ring = ring,
.watchdog = watchdog,
.iter = 0,
.reason = reason,
};
@ -209,30 +210,28 @@ vn_relax(struct vn_relax_state *state)
* another 2047 shorter sleeps)
*/
if (unlikely(*iter % (1 << warn_order) == 0)) {
vn_log(NULL, "stuck in %s wait with iter at %d", reason, *iter);
struct vn_instance *instance = ring->instance;
vn_log(instance, "stuck in %s wait with iter at %d", reason, *iter);
struct vn_watchdog *watchdog = state->watchdog;
const uint32_t status = vn_ring_load_status(ring);
if (status & VK_RING_STATUS_FATAL_BIT_MESA) {
vn_log(NULL, "aborting on ring fatal error at iter %d", *iter);
vn_log(instance, "aborting on ring fatal error at iter %d", *iter);
abort();
}
if (ring->instance->ring.monitor.report_period_us) {
if (vn_ring_monitor_acquire(ring)) {
ring->instance->ring.monitor.alive =
status & VK_RING_STATUS_ALIVE_BIT_MESA;
vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
}
const bool alive = status & VK_RING_STATUS_ALIVE_BIT_MESA;
if (vn_watchdog_acquire(watchdog, alive))
vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
if (!ring->instance->ring.monitor.alive && !VN_DEBUG(NO_ABORT)) {
vn_log(NULL, "aborting on expired ring alive status at iter %d",
*iter);
abort();
}
if (vn_watchdog_timeout(watchdog) && !VN_DEBUG(NO_ABORT)) {
vn_log(instance, "aborting on expired ring alive status at iter %d",
*iter);
abort();
}
if (*iter >= (1 << abort_order) && !VN_DEBUG(NO_ABORT)) {
vn_log(NULL, "aborting");
vn_log(instance, "aborting");
abort();
}
}

View file

@ -49,6 +49,7 @@
#include "vn_entrypoints.h"
#define VN_DEFAULT_ALIGN 8
#define VN_WATCHDOG_REPORT_PERIOD_US 3000000
#define VN_DEBUG(category) (unlikely(vn_env.debug & VN_DEBUG_##category))
#define VN_PERF(category) (unlikely(vn_env.perf & VN_PERF_##category))
@ -181,8 +182,28 @@ struct vn_env {
};
extern struct vn_env vn_env;
/* Only one "waiting" thread may fulfill the "watchdog" role at a time. Every
* VN_WATCHDOG_REPORT_PERIOD_US or longer, the watchdog tests the ring's ALIVE
* status, updates the "alive" atomic, and resets the ALIVE status for the
* next cycle. Other waiting threads just check the "alive" atomic. The
* watchdog role may be released and acquired by another waiting thread
* dynamically.
*
* Examples of "waiting" are to wait for:
* - ring to reach a seqno
* - ring space to be released
* - sync primitives to signal
* - query result being available
*/
/* State for the single thread currently fulfilling the watchdog role
 * (see the comment above for the acquire/release protocol).
 */
struct vn_watchdog {
/* serializes the watchdog role; held by the thread that owns "tid" */
mtx_t mutex;
/* tid of the current watchdog thread, or 0 when the role is unowned */
atomic_int tid;
/* last observed ring ALIVE status; polled by non-watchdog waiters */
atomic_bool alive;
};
/* Per-wait state threaded through vn_relax_init/vn_relax/vn_relax_fini. */
struct vn_relax_state {
/* the ring being waited on */
struct vn_ring *ring;
/* instance-wide watchdog; released by vn_relax_fini */
struct vn_watchdog *watchdog;
/* wait iteration count driving the warn/abort thresholds in vn_relax */
uint32_t iter;
/* human-readable wait reason used in log messages */
const char *reason;
};
@ -254,8 +275,35 @@ vn_refcount_dec(struct vn_refcount *ref)
uint32_t
vn_extension_get_spec_version(const char *name);
void
vn_ring_monitor_release(struct vn_ring *ring);
/* One-time initialization of the watchdog: sets up the role mutex and
 * marks the state unowned and alive.
 */
static inline void
vn_watchdog_init(struct vn_watchdog *watchdog)
{
   mtx_init(&watchdog->mutex, mtx_plain);
   watchdog->tid = 0;

   /* start out alive so vn_watchdog_timeout cannot fire spuriously
    * before the first liveness check completes
    */
   watchdog->alive = true;

#ifndef NDEBUG
   /* The minimum check period must exceed the renderer's maximum
    * reporting period with a margin of safety, otherwise a healthy
    * renderer could be reported dead (false positive).
    *
    * first_warn_time is pre-computed from the vn_relax backoff
    * parameters and must be updated together with them.
    */
   static const uint32_t first_warn_time = 3481600;
   static const uint32_t safety_margin = 250000;
   assert(first_warn_time - safety_margin >= VN_WATCHDOG_REPORT_PERIOD_US);
#endif
}
/* Destroys the watchdog mutex; counterpart of vn_watchdog_init. Must not
 * be called while any thread still holds the watchdog role.
 */
static inline void
vn_watchdog_fini(struct vn_watchdog *watchdog)
{
mtx_destroy(&watchdog->mutex);
}
struct vn_relax_state
vn_relax_init(struct vn_ring *ring, const char *reason);
@ -263,11 +311,8 @@ vn_relax_init(struct vn_ring *ring, const char *reason);
void
vn_relax(struct vn_relax_state *state);
static inline void
vn_relax_fini(struct vn_relax_state *state)
{
vn_ring_monitor_release(state->ring);
}
void
vn_relax_fini(struct vn_relax_state *state);
static_assert(sizeof(vn_object_id) >= sizeof(uintptr_t), "");

View file

@ -125,7 +125,7 @@ vn_instance_fini_ring(struct vn_instance *instance)
vn_renderer_submit_simple(instance->renderer, destroy_ring_data,
vn_cs_encoder_get_len(&local_enc));
mtx_destroy(&instance->ring.monitor.mutex);
vn_watchdog_fini(&instance->ring.watchdog);
vn_ring_fini(&instance->ring.ring);
@ -138,7 +138,7 @@ static VkResult
vn_instance_init_ring(struct vn_instance *instance)
{
/* 32-bit seqno for renderer roundtrips */
const size_t extra_size = sizeof(uint32_t);
static const size_t extra_size = sizeof(uint32_t);
struct vn_ring_layout layout;
vn_ring_get_layout(VN_INSTANCE_RING_SIZE, extra_size, &layout);
@ -157,16 +157,11 @@ vn_instance_init_ring(struct vn_instance *instance)
instance->ring.id = (uintptr_t)ring;
instance->ring.monitor.report_period_us = 3000000;
mtx_init(&instance->ring.monitor.mutex, mtx_plain);
/* ring monitor should be alive at all time */
instance->ring.monitor.alive = true;
vn_watchdog_init(&instance->ring.watchdog);
const struct VkRingMonitorInfoMESA monitor_info = {
.sType = VK_STRUCTURE_TYPE_RING_MONITOR_INFO_MESA,
.maxReportingPeriodMicroseconds =
instance->ring.monitor.report_period_us,
.maxReportingPeriodMicroseconds = VN_WATCHDOG_REPORT_PERIOD_US,
};
const struct VkRingCreateInfoMESA info = {
.sType = VK_STRUCTURE_TYPE_RING_CREATE_INFO_MESA,

View file

@ -64,21 +64,7 @@ struct vn_instance {
mtx_t roundtrip_mutex;
uint64_t roundtrip_next;
/* Only one "waiting" thread may fulfill the "monitor" role at a time.
* Every "report_period_us" or longer, the waiting "monitor" thread
* tests the ring's ALIVE status, updates the "alive" atomic, and resets
* the ALIVE status for the next cycle. Waiting non-"monitor" threads,
* just check the "alive" atomic. The "monitor" role may be released and
* acquired by another waiting thread dynamically.
*/
struct {
mtx_t mutex;
atomic_int threadid;
atomic_bool alive;
/* constant and non-zero after ring init, if monitoring is enabled */
uint32_t report_period_us;
} monitor;
struct vn_watchdog watchdog;
} ring;
/* Between the driver and the app, VN_MAX_API_VERSION is what we advertise