/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef VK_DEVICE_H
#define VK_DEVICE_H
#include "rmv/vk_rmv_common.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_object.h"
#include "vk_physical_device_features.h"
#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_atomic.h"
#include "util/u_sync_provider.h"
#ifdef __cplusplus
extern "C" {
#endif
struct vk_acceleration_structure_build_ops;
struct vk_command_buffer_ops;
struct vk_device_shader_ops;
struct vk_sync_signal;
struct vk_sync_wait;
enum vk_queue_submit_mode {
/** Submits happen immediately
*
* `vkQueueSubmit()` and `vkQueueBindSparse()` call
* ``vk_queue::driver_submit`` directly for all submits and the last call to
* ``vk_queue::driver_submit`` will have completed by the time
* `vkQueueSubmit()` or `vkQueueBindSparse()` return.
*/
VK_QUEUE_SUBMIT_MODE_IMMEDIATE,
/** Submits may be deferred until a future `vk_queue_flush()`
*
* Submits are added to the queue and `vk_queue_flush()` is called.
* However, any submits with unsatisfied dependencies will be left on the
* queue until a future `vk_queue_flush()` call. This is used for
* implementing emulated timeline semaphores without threading.
*/
VK_QUEUE_SUBMIT_MODE_DEFERRED,
/** Submits will be added to the queue and handled later by a thread
*
* This places additional requirements on the vk_sync types used by the
* driver:
*
* 1. All `vk_sync` types which support `VK_SYNC_FEATURE_GPU_WAIT` also
* support `VK_SYNC_FEATURE_WAIT_PENDING` so that the threads can
* sort out when a given submit has all its dependencies resolved.
*
* 2. All binary `vk_sync` types which support `VK_SYNC_FEATURE_GPU_WAIT`
* also support `VK_SYNC_FEATURE_CPU_RESET` so we can reset
* semaphores after waiting on them.
*
* 3. All vk_sync types used as permanent payloads of semaphores support
* ``vk_sync_type::move`` so that it can move the pending signal into a
* temporary vk_sync and reset the semaphore.
*
* This is required for shared timeline semaphores where we need to handle
* wait-before-signal by threading in the driver if we ever see an
* unresolved dependency.
*/
VK_QUEUE_SUBMIT_MODE_THREADED,
/** Threaded but only if we need it to resolve dependencies
*
* This imposes all the same requirements on `vk_sync` types as
* `VK_QUEUE_SUBMIT_MODE_THREADED`.
*/
VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND,
};
struct vk_device_memory_report {
PFN_vkDeviceMemoryReportCallbackEXT callback;
void *data;
};
/** Base struct for VkDevice */
struct vk_device {
struct vk_object_base base;
/** Allocator used to create this device
*
* This is used as a fall-back for when a NULL pAllocator is passed into a
* device-level create function such as vkCreateImage().
*/
VkAllocationCallbacks alloc;
/** Pointer to the physical device */
struct vk_physical_device *physical;
/** Table of enabled extensions */
struct vk_device_extension_table enabled_extensions;
/** Table of enabled features */
struct vk_features enabled_features;
/** Device-level dispatch table */
struct vk_device_dispatch_table dispatch_table;
/** Command dispatch table
*
* This is used for emulated secondary command buffer support. To use
* emulated (trace/replay) secondary command buffers:
*
* 1. Provide your "real" command buffer dispatch table here. Because
* this doesn't get populated by vk_device_init(), the driver will have
* to add the vk_common entrypoints to this table itself.
*
* 2. Add vk_enqueue_unless_primary_device_entrypoint_table to your device
* level dispatch table.
*/
const struct vk_device_dispatch_table *command_dispatch_table;
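/* A minimal sketch of the wiring described above; drv_cmd_dispatch_table is
 * a hypothetical driver table populated with the "real" command buffer
 * entrypoints plus the vk_common ones:
 *
 *    device->vk.command_dispatch_table = &drv_cmd_dispatch_table;
 *
 * with vk_enqueue_unless_primary_device_entrypoint_table merged into the
 * device-level dispatch table per step 2.
 */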
/** Command buffer vtable when using the common command pool */
const struct vk_command_buffer_ops *command_buffer_ops;
/** Shader vtable for VK_EXT_shader_object and common pipelines */
const struct vk_device_shader_ops *shader_ops;
/** Acceleration structure build vtable for common BVH building. */
const struct vk_acceleration_structure_build_ops *as_build_ops;
/**
* Write data to a buffer from the command processor. This is simpler than
* setting up a staging buffer and faster for small writes, but is not
* meant for larger amounts of data. \p data is owned by the caller and the
* driver is expected to write it out directly to the command stream as
* part of an immediate write packet.
*/
void (*write_buffer_cp)(VkCommandBuffer cmdbuf, VkDeviceAddress addr,
void *data, uint32_t size);
/* Flush data written via write_buffer_cp. Users must use a normal pipeline
* barrier in order to read this data, with the appropriate destination
* access, but this replaces the source access mask.
*/
void (*flush_buffer_write_cp)(VkCommandBuffer cmdbuf);
/* An unaligned dispatch function. This launches a number of threads that
* may not be a multiple of the workgroup size, which may result in partial
* workgroups.
*/
void (*cmd_dispatch_unaligned)(VkCommandBuffer cmdbuf,
uint32_t invocations_x,
uint32_t invocations_y,
uint32_t invocations_z);
/* vkCmdFillBuffer but with a device address. */
void (*cmd_fill_buffer_addr)(VkCommandBuffer cmdbuf,
VkDeviceAddress devAddr,
VkDeviceSize size,
uint32_t data);
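/* A hedged sketch of a driver opting into these command-processor hooks at
 * device-init time; the drv_* callbacks are hypothetical:
 *
 *    device->vk.write_buffer_cp = drv_write_buffer_cp;
 *    device->vk.flush_buffer_write_cp = drv_flush_buffer_write_cp;
 *    device->vk.cmd_dispatch_unaligned = drv_cmd_dispatch_unaligned;
 *    device->vk.cmd_fill_buffer_addr = drv_cmd_fill_buffer_addr;
 */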
/** Driver provided callback for capturing traces
*
* Triggers for this callback are:
* - Keyboard input (F12)
* - Creation of a trigger file
* - Reaching the trace frame
*/
VkResult (*capture_trace)(VkQueue queue);
uint32_t current_frame;
bool trace_hotkey_trigger;
simple_mtx_t trace_mtx;
/* For VK_EXT_private_data */
uint32_t private_data_next_index;
struct list_head queues;
struct {
int lost;
bool reported;
} _lost;
/** Checks the status of this device
*
* This is expected to return either VK_SUCCESS or VK_ERROR_DEVICE_LOST.
* It is called before ``vk_queue::driver_submit`` and after every non-trivial
* wait operation to ensure the device is still around. This gives the
* driver a hook to ask the kernel if its device is still valid. If the
* kernel says the device has been lost, it MUST call vk_device_set_lost().
*
* This function may be called from any thread at any time.
*/
VkResult (*check_status)(struct vk_device *device);
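/* A minimal check_status sketch, assuming a hypothetical drv_device wrapper
 * and kernel-status query:
 *
 *    static VkResult
 *    drv_check_status(struct vk_device *vk_dev)
 *    {
 *       struct drv_device *dev = container_of(vk_dev, struct drv_device, vk);
 *
 *       if (drv_kernel_context_is_lost(dev))
 *          return vk_device_set_lost(vk_dev, "kernel reported a lost context");
 *
 *       return VK_SUCCESS;
 *    }
 */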
/* Get the device timestamp in the VK_TIME_DOMAIN_DEVICE_KHR domain */
VkResult (*get_timestamp)(struct vk_device *device, uint64_t *timestamp);
/** Host time domain used for timestamp calibration */
VkTimeDomainKHR calibrate_time_domain;
/** Period of VK_TIME_DOMAIN_DEVICE_KHR */
uint64_t device_time_domain_period;
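/* A hedged sketch of the setup a driver would do to opt into the common
 * vk_device_get_timestamp() path; drv_read_gpu_timestamp and the period
 * source are illustrative:
 *
 *    device->vk.get_timestamp = drv_read_gpu_timestamp;
 *    device->vk.calibrate_time_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
 *    device->vk.device_time_domain_period = pdevice->timestamp_period;
 */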
/** Copies the sync payloads from the set of waits to the set of signals
*
* This effectively does the same as a vk_queue::driver_submit() with the
* given set of waits and signals and no command buffers, only without the
* queue. Instead, the driver is expected to simply copy the sync payloads
* from the wait set, merge them together into one, and apply that to the
* signals. After this function returns, all of the signals are now
* equivalent to the union of all of the waits.
*/
VkResult (*copy_sync_payloads)(struct vk_device *device,
uint32_t wait_count,
const struct vk_sync_wait *waits,
uint32_t signal_count,
const struct vk_sync_signal *signals);
/* Set by vk_device_set_drm_fd() */
struct util_sync_provider *sync;
/** Implicit pipeline cache, or NULL */
struct vk_pipeline_cache *mem_cache;
/** An enum describing how timeline semaphores work */
enum vk_device_timeline_mode {
/** Timeline semaphores are not supported */
VK_DEVICE_TIMELINE_MODE_NONE,
/** Timeline semaphores are emulated with vk_timeline
*
* In this mode, timeline semaphores are emulated using vk_timeline
* which is a collection of binary semaphores, one per time point.
* These timeline semaphores cannot be shared because the data structure
* exists entirely in userspace. These timelines are virtually
* invisible to the driver; all it sees are the binary vk_syncs, one per
* time point.
*
* To handle wait-before-signal, we place all vk_queue_submits in the
* queue's submit list in vkQueueSubmit() and call vk_device_flush() at
* key points such as the end of vkQueueSubmit() and vkSemaphoreSignal().
* This ensures that, as soon as a given submit's dependencies are fully
* resolvable, it gets submitted to the driver.
*/
VK_DEVICE_TIMELINE_MODE_EMULATED,
/** Timeline semaphores are a kernel-assisted emulation
*
* In this mode, timeline semaphores are still technically an emulation
* in the sense that they don't support wait-before-signal natively.
* Instead, all GPU-waitable objects support a CPU wait-for-pending
* operation which lets the userspace driver wait until a given event
* on the (possibly shared) vk_sync is pending. The event is "pending"
* if a job has been submitted to the kernel (possibly from a different
* process) which will signal it. In vkQueueSubmit(), we use this wait
* mode to detect waits which are not yet pending and, the first time we
* do, spawn a thread to manage the queue. That thread waits for each
* submit's waits to all be pending before submitting to the driver
* queue.
*
* We have to be a bit more careful about a few things in this mode.
* In particular, we can never assume that any given wait operation is
* pending. For instance, when we go to export a sync file from a
* binary semaphore, we need to first wait for it to be pending. The
* spec guarantees that the vast majority of these waits return almost
* immediately, but we do need to insert them for correctness.
*/
VK_DEVICE_TIMELINE_MODE_ASSISTED,
/** Timeline semaphores are 100% native
*
* In this mode, wait-before-signal is natively supported by the
* underlying timeline implementation. We can submit-and-forget and
* assume that dependencies will get resolved for us by the kernel.
* Currently, this isn't supported by any Linux primitives.
*/
VK_DEVICE_TIMELINE_MODE_NATIVE,
} timeline_mode;
/** Per-device submit mode
*
* This represents the device-wide submit strategy which may be different
* from the per-queue submit mode. See vk_queue.submit.mode for more
* details.
*/
enum vk_queue_submit_mode submit_mode;
struct vk_memory_trace_data memory_trace_data;
mtx_t swapchain_private_mtx;
struct hash_table *swapchain_private;
mtx_t swapchain_name_mtx;
struct hash_table *swapchain_name;
/* For VK_KHR_pipeline_binary */
bool disable_internal_cache;
/* Link-time optimization disable */
bool disable_lto;
struct vk_device_memory_report *memory_reports;
uint32_t memory_report_count;
};
VK_DEFINE_HANDLE_CASTS(vk_device, base, VkDevice,
VK_OBJECT_TYPE_DEVICE);
/** Initialize a vk_device
*
* Along with initializing the data structures in `vk_device`, this function
* checks that every extension specified by
* ``VkDeviceCreateInfo::ppEnabledExtensionNames`` is actually supported by
* the physical device and returns `VK_ERROR_EXTENSION_NOT_PRESENT` if an
* unsupported extension is requested. It also checks all the feature structs
* chained into the `pCreateInfo->pNext` chain against the features returned
* by `vkGetPhysicalDeviceFeatures2` and returns
* `VK_ERROR_FEATURE_NOT_PRESENT` if an unsupported feature is requested.
*
* :param device: |out| The device to initialize
* :param physical_device: |in| The physical device
* :param dispatch_table: |in| Device-level dispatch table
* :param pCreateInfo: |in| VkDeviceCreateInfo pointer passed to
* `vkCreateDevice()`
* :param alloc: |in| Allocation callbacks passed to
* `vkCreateDevice()`
*/
VkResult MUST_CHECK
vk_device_init(struct vk_device *device,
struct vk_physical_device *physical_device,
const struct vk_device_dispatch_table *dispatch_table,
const VkDeviceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *alloc);
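/* Typical use from a driver's vkCreateDevice() implementation, a sketch in
 * which drv_device_entrypoints and the surrounding structs are assumed
 * driver names:
 *
 *    struct vk_device_dispatch_table dispatch_table;
 *    vk_device_dispatch_table_from_entrypoints(
 *       &dispatch_table, &drv_device_entrypoints, true);
 *    vk_device_dispatch_table_from_entrypoints(
 *       &dispatch_table, &vk_common_device_entrypoints, false);
 *
 *    VkResult result = vk_device_init(&device->vk, &pdevice->vk,
 *                                     &dispatch_table, pCreateInfo, pAllocator);
 *    if (result != VK_SUCCESS)
 *       return result;
 */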
static inline void
vk_device_set_drm_fd(struct vk_device *device, int drm_fd)
{
device->sync = util_sync_provider_drm(drm_fd);
}
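/* Typically called right after the driver opens its DRM render node, e.g.
 * (device path shown only for illustration):
 *
 *    int fd = open("/dev/dri/renderD128", O_RDWR | O_CLOEXEC);
 *    vk_device_set_drm_fd(&device->vk, fd);
 */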
/** Tears down a vk_device
*
* :param device: |out| The device to tear down
*/
void
vk_device_finish(struct vk_device *device);
/** Enables threaded submit on this device
*
* This doesn't ensure that threaded submit will be used. It just disables
* the deferred submit option for emulated timeline semaphores and forces them
* to always use the threaded path. It also does some checks that the vk_sync
* types used by the driver work for threaded submit.
*
* This must be called before any queues are created.
*/
void vk_device_enable_threaded_submit(struct vk_device *device);
static inline bool
vk_device_supports_threaded_submit(const struct vk_device *device)
{
return device->submit_mode == VK_QUEUE_SUBMIT_MODE_THREADED ||
device->submit_mode == VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND;
}
VkResult vk_device_flush(struct vk_device *device);
VkResult PRINTFLIKE(4, 5)
_vk_device_set_lost(struct vk_device *device,
const char *file, int line,
const char *msg, ...);
#define vk_device_set_lost(device, ...) \
_vk_device_set_lost(device, __FILE__, __LINE__, __VA_ARGS__)
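/* Typical use when a kernel submission fails, a sketch with a hypothetical
 * submit ioctl; the helper returns VK_ERROR_DEVICE_LOST so callers can
 * propagate it directly:
 *
 *    if (drmIoctl(dev->fd, DRM_IOCTL_DRV_SUBMIT, &args) < 0)
 *       return vk_device_set_lost(&dev->vk, "submit ioctl failed: %m");
 */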
void _vk_device_report_lost(struct vk_device *device);
static inline bool
vk_device_is_lost_no_report(struct vk_device *device)
{
return p_atomic_read(&device->_lost.lost) > 0;
}
static inline bool
vk_device_is_lost(struct vk_device *device)
{
int lost = vk_device_is_lost_no_report(device);
if (unlikely(lost && !device->_lost.reported))
_vk_device_report_lost(device);
return lost;
}
static inline VkResult
vk_device_check_status(struct vk_device *device)
{
if (vk_device_is_lost(device))
return VK_ERROR_DEVICE_LOST;
if (!device->check_status)
return VK_SUCCESS;
VkResult result = device->check_status(device);
assert(result == VK_SUCCESS || result == VK_ERROR_DEVICE_LOST);
if (result == VK_ERROR_DEVICE_LOST)
assert(vk_device_is_lost_no_report(device));
return result;
}
/** Copy semaphore payloads to other semaphores/fences
*
* This is equivalent to doing VkQueueSubmit without any command buffers or
* sparse bind operations and without implicitly synchronizing on any queue.
*/
VkResult
vk_device_copy_semaphore_payloads(struct vk_device *device,
uint32_t wait_semaphore_count,
const VkSemaphoreSubmitInfo *wait_semaphores,
uint32_t signal_semaphore_count,
const VkSemaphoreSubmitInfo *signal_semaphores,
uint32_t fence_count,
const VkFence *fences);
VkResult
vk_device_get_timestamp(struct vk_device *device, VkTimeDomainKHR domain,
uint64_t *timestamp);
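/* For example, sampling the device clock and the calibration host clock
 * back to back:
 *
 *    uint64_t gpu_ts, cpu_ts;
 *    result = vk_device_get_timestamp(device, VK_TIME_DOMAIN_DEVICE_KHR,
 *                                     &gpu_ts);
 *    if (result == VK_SUCCESS)
 *       result = vk_device_get_timestamp(device,
 *                                        device->calibrate_time_domain,
 *                                        &cpu_ts);
 */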
static inline uint64_t
vk_time_max_deviation(uint64_t begin, uint64_t end, uint64_t max_clock_period)
{
/*
* The maximum deviation is the sum of the interval over which we
* perform the sampling and the maximum period of any sampled
* clock. That's because the maximum skew between any two sampled
* clock edges is when the sampled clock with the largest period is
* sampled at the end of that period but right at the beginning of the
* sampling interval and some other clock is sampled right at the
* beginning of its sampling period and right at the end of the
* sampling interval. Let's assume the GPU has the longest clock
* period and that the application is sampling GPU and monotonic:
*
* s e
* w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
* Raw -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
*
* g
* 0 1 2 3
* GPU -----_____-----_____-----_____-----_____
*
* m
* x y z 0 1 2 3 4 5 6 7 8 9 a b c
* Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
*
* Interval <----------------->
* Deviation <-------------------------->
*
* s = read(raw) 2
* g = read(GPU) 1
* m = read(monotonic) 2
* e = read(raw) b
*
* We round the sample interval up by one tick to cover sampling error
* in the interval clock
*/
uint64_t sample_interval = end - begin + 1;
return sample_interval + max_clock_period;
}
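/* A hedged sketch of how this pairs with vk_device_get_timestamp() in a
 * vkGetCalibratedTimestampsKHR implementation:
 *
 *    uint64_t begin, end;
 *    vk_device_get_timestamp(device, device->calibrate_time_domain, &begin);
 *    for (uint32_t i = 0; i < timestampCount; i++) {
 *       vk_device_get_timestamp(device, pTimestampInfos[i].timeDomain,
 *                               &pTimestamps[i]);
 *    }
 *    vk_device_get_timestamp(device, device->calibrate_time_domain, &end);
 *    *pMaxDeviation = vk_time_max_deviation(begin, end,
 *                                           device->device_time_domain_period);
 */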
PFN_vkVoidFunction
vk_device_get_proc_addr(const struct vk_device *device,
const char *name);
#ifdef __cplusplus
}
#endif
#endif /* VK_DEVICE_H */