Merge branch 'work/tu_kgsl_timeline_sync' into 'main'

tu/kgsl: timeline-based queue and sync primitives

See merge request mesa/mesa!39751
This commit is contained in:
Zan Dobersek 2026-03-11 05:38:00 +00:00
commit e0d31a135c
4 changed files with 680 additions and 100 deletions

View file

@@ -75,6 +75,7 @@
#include "vk_object.h"
#include "vk_sync.h"
#include "vk_drm_syncobj.h"
#include "vk_sync_binary.h"
#include "vk_sync_timeline.h"
#define MAX_VBS 32

View file

@@ -80,6 +80,11 @@ struct tu_queue_family {
const VkQueueFamilyProperties *properties;
};
enum tu_kgsl_sync_impl_type {
TU_KGSL_SYNC_IMPL_TYPE_SYNCOBJ,
TU_KGSL_SYNC_IMPL_TYPE_TIMELINE,
};
extern uint64_t os_page_size;
struct tu_physical_device
@@ -162,7 +167,9 @@ struct tu_physical_device
struct tu_memory_heap heap;
enum tu_kgsl_sync_impl_type kgsl_sync_impl_type;
struct vk_sync_type syncobj_type;
struct vk_sync_binary_type binary_type;
struct vk_sync_timeline_type timeline_type;
const struct vk_sync_type *sync_types[3];

View file

@@ -49,6 +49,92 @@ safe_ioctl(int fd, unsigned long request, void *arg)
return ret;
}
static int
kgsl_timeline_create_ioctl(int fd,
uint64_t initial_value,
uint32_t *id)
{
struct kgsl_timeline_create req = { .seqno = initial_value };
int ret = safe_ioctl(fd, IOCTL_KGSL_TIMELINE_CREATE, &req);
if (!ret) {
/* ID 0 is a sentinel value; a valid returned ID is always non-zero. */
assert(req.id != 0);
*id = req.id;
}
return ret;
}
static int
kgsl_timeline_destroy_ioctl(int fd,
uint32_t id)
{
/* ID 0 is a sentinel value; a valid ID is always non-zero. */
assert(id != 0);
return safe_ioctl(fd, IOCTL_KGSL_TIMELINE_DESTROY, &id);
}
static int
kgsl_timeline_signal_ioctl(int fd,
struct kgsl_timeline_val *timeline_vals,
uint32_t count)
{
struct kgsl_timeline_signal req = {
.timelines = (uint64_t)(uintptr_t) timeline_vals,
.count = count,
.timelines_size = sizeof(struct kgsl_timeline_val)
};
return safe_ioctl(fd, IOCTL_KGSL_TIMELINE_SIGNAL, &req);
}
static int
kgsl_timeline_wait_ioctl(int fd,
int64_t timeout,
struct kgsl_timeline_val *timeline_vals,
uint32_t count,
uint32_t flags)
{
struct kgsl_timeline_wait req = {
.tv_sec = timeout / NSEC_PER_SEC,
.tv_nsec = timeout % NSEC_PER_SEC,
.timelines = (uint64_t)(uintptr_t) timeline_vals,
.count = count,
.timelines_size = sizeof(struct kgsl_timeline_val),
.flags = flags
};
return safe_ioctl(fd, IOCTL_KGSL_TIMELINE_WAIT, &req);
}
static int
kgsl_timeline_query_ioctl(int fd,
uint32_t id,
uint64_t *value)
{
struct kgsl_timeline_val req = { .timeline = id };
int ret = safe_ioctl(fd, IOCTL_KGSL_TIMELINE_QUERY, &req);
if (!ret)
*value = req.seqno;
return ret;
}
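
Together, these wrappers cover the full timeline lifecycle: create, CPU-side signal, CPU-side wait, query, destroy. A minimal smoke-test sketch of how they compose (illustrative only, not part of this change; the function name is hypothetical and fd is an open KGSL device fd):

static int
kgsl_timeline_smoke_test(int fd)
{
   uint32_t id;
   uint64_t value = 0;

   /* Create a timeline whose seqno starts at 0. */
   if (kgsl_timeline_create_ioctl(fd, 0, &id))
      return -1;

   /* Signal seqno 1 from the CPU... */
   struct kgsl_timeline_val val = { .seqno = 1, .timeline = id };
   int ret = kgsl_timeline_signal_ioctl(fd, &val, 1);

   /* ...after which a zero-timeout wait on seqno 1 succeeds... */
   if (!ret)
      ret = kgsl_timeline_wait_ioctl(fd, 0, &val, 1, KGSL_TIMELINE_WAIT_ALL);

   /* ...and a query reports the last signaled seqno (here, 1). */
   if (!ret)
      ret = kgsl_timeline_query_ioctl(fd, id, &value);

   kgsl_timeline_destroy_ioctl(fd, id);
   return ret;
}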
static void
kgsl_submitqueue_destroy(struct tu_device *dev, struct tu_queue *queue)
{
if (queue->msm_queue_id) {
struct kgsl_drawctxt_destroy req = {
.drawctxt_id = queue->msm_queue_id
};
safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_DRAWCTXT_DESTROY, &req);
}
if (queue->kgsl_queue_timeline_id)
kgsl_timeline_destroy_ioctl(dev->physical_device->local_fd, queue->kgsl_queue_timeline_id);
}
static int
kgsl_submitqueue_new(struct tu_device *dev, struct tu_queue *queue)
{
@@ -60,21 +146,27 @@ kgsl_submitqueue_new(struct tu_device *dev, struct tu_queue *queue)
int ret = safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_DRAWCTXT_CREATE, &req);
if (ret)
return ret;
goto fail;
queue->msm_queue_id = req.drawctxt_id;
if (dev->physical_device->kgsl_sync_impl_type == TU_KGSL_SYNC_IMPL_TYPE_TIMELINE) {
ret = kgsl_timeline_create_ioctl(dev->physical_device->local_fd,
0, &queue->kgsl_queue_timeline_id);
if (ret)
goto fail;
}
return 0;
fail:
kgsl_submitqueue_destroy(dev, queue);
return ret;
}
static void
kgsl_submitqueue_close(struct tu_device *dev, struct tu_queue *queue)
{
struct kgsl_drawctxt_destroy req = {
.drawctxt_id = queue->msm_queue_id,
};
safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_DRAWCTXT_DESTROY, &req);
kgsl_submitqueue_destroy(dev, queue);
}
static void kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo);
@@ -712,6 +804,19 @@ get_relative_ms(uint64_t abs_timeout_ns)
return abs_timeout_ms - cur_time_ms;
}
static int64_t
get_relative_ns(uint64_t abs_timeout_ns)
{
if (abs_timeout_ns >= INT64_MAX)
return INT64_MAX;
int64_t cur_time_ns = os_time_get_nano();
if (abs_timeout_ns <= cur_time_ns)
return 0;
return abs_timeout_ns - cur_time_ns;
}
/* safe_ioctl is not enough, as restarted waits would not adjust the
* timeout, which could lead to waiting substantially longer than
* requested.
*/
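
The helper this comment introduces falls outside the hunk, but the pattern it describes is to recompute the remaining time before each retry instead of restarting the wait with the full timeout. A minimal sketch of that pattern (illustrative only; the helper name is hypothetical):

static int
wait_ioctl_with_deadline(int fd, struct kgsl_timeline_wait *req,
                         uint64_t abs_timeout_ns)
{
   int ret;
   do {
      /* Re-derive the relative timeout on every attempt so that a wait
       * interrupted by a signal resumes with the time actually remaining
       * rather than the full original timeout. */
      int64_t rel = get_relative_ns(abs_timeout_ns);
      req->tv_sec = rel / NSEC_PER_SEC;
      req->tv_nsec = rel % NSEC_PER_SEC;
      ret = ioctl(fd, IOCTL_KGSL_TIMELINE_WAIT, req);
   } while (ret == -1 && errno == EINTR);
   return ret;
}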
@@ -1203,6 +1308,151 @@ const struct vk_sync_type vk_kgsl_sync_type = {
.export_sync_file = vk_kgsl_sync_export_sync_file,
};
struct vk_kgsl_timeline
{
struct vk_sync vk;
uint32_t id;
};
static VkResult
vk_kgsl_timeline_init(struct vk_device *_device,
struct vk_sync *sync,
uint64_t initial_value)
{
struct tu_device *device = container_of(_device, struct tu_device, vk);
struct vk_kgsl_timeline *timeline = container_of(sync, struct vk_kgsl_timeline, vk);
int ret = kgsl_timeline_create_ioctl(device->physical_device->local_fd,
initial_value, &timeline->id);
if (ret) {
return vk_errorf(_device, VK_ERROR_OUT_OF_HOST_MEMORY,
"kgsl_timeline_create failed: %m");
}
return VK_SUCCESS;
}
static void
vk_kgsl_timeline_finish(struct vk_device *_device,
struct vk_sync *sync)
{
struct tu_device *device = container_of(_device, struct tu_device, vk);
struct vk_kgsl_timeline *timeline = container_of(sync, struct vk_kgsl_timeline, vk);
kgsl_timeline_destroy_ioctl(device->physical_device->local_fd, timeline->id);
}
static VkResult
vk_kgsl_timeline_signal_many(struct vk_device *_device,
uint32_t signal_count,
const struct vk_sync_signal *signals)
{
struct tu_device *device = container_of(_device, struct tu_device, vk);
STACK_ARRAY(struct kgsl_timeline_val, timeline_vals, signal_count);
for (uint32_t i = 0; i < signal_count; ++i) {
struct vk_kgsl_timeline *timeline = container_of(signals[i].sync, struct vk_kgsl_timeline, vk);
timeline_vals[i] = (struct kgsl_timeline_val) {
.seqno = signals[i].signal_value,
.timeline = timeline->id
};
}
int ret = kgsl_timeline_signal_ioctl(device->physical_device->local_fd,
timeline_vals, signal_count);
STACK_ARRAY_FINISH(timeline_vals);
if (ret) {
return vk_errorf(_device, VK_ERROR_UNKNOWN,
"kgsl_timeline_signal failed: %m");
}
return VK_SUCCESS;
}
static VkResult
vk_kgsl_timeline_signal(struct vk_device *device,
struct vk_sync *sync,
uint64_t value)
{
struct vk_sync_signal signal = {
.sync = sync,
.stage_mask = ~(VkPipelineStageFlags2)0,
.signal_value = value
};
return vk_kgsl_timeline_signal_many(device, 1, &signal);
}
static VkResult
vk_kgsl_timeline_wait_many(struct vk_device *_device,
uint32_t wait_count,
const struct vk_sync_wait *waits,
enum vk_sync_wait_flags wait_flags,
uint64_t abs_timeout_ns)
{
struct tu_device *device = container_of(_device, struct tu_device, vk);
STACK_ARRAY(struct kgsl_timeline_val, timeline_vals, wait_count);
for (uint32_t i = 0; i < wait_count; ++i) {
struct vk_kgsl_timeline *timeline = container_of(waits[i].sync, struct vk_kgsl_timeline, vk);
timeline_vals[i] = (struct kgsl_timeline_val) {
.seqno = waits[i].wait_value,
.timeline = timeline->id
};
}
uint32_t flag = KGSL_TIMELINE_WAIT_ALL;
if (wait_flags & VK_SYNC_WAIT_ANY)
flag = KGSL_TIMELINE_WAIT_ANY;
int ret = kgsl_timeline_wait_ioctl(device->physical_device->local_fd,
get_relative_ns(abs_timeout_ns),
timeline_vals, wait_count, flag);
STACK_ARRAY_FINISH(timeline_vals);
if (ret) {
if (errno == EBUSY || errno == ETIMEDOUT)
return VK_TIMEOUT;
return vk_errorf(_device, VK_ERROR_UNKNOWN,
"kgsl_timeline_wait failed: %m");
}
return VK_SUCCESS;
}
static VkResult
vk_kgsl_timeline_get_value(struct vk_device *_device,
struct vk_sync *sync,
uint64_t *value)
{
struct tu_device *device = container_of(_device, struct tu_device, vk);
struct vk_kgsl_timeline *timeline = container_of(sync, struct vk_kgsl_timeline, vk);
int ret = kgsl_timeline_query_ioctl(device->physical_device->local_fd,
timeline->id, value);
if (ret) {
return vk_errorf(_device, VK_ERROR_UNKNOWN,
"kgsl_timeline_query failed: %m");
}
return VK_SUCCESS;
}
const struct vk_sync_type vk_kgsl_timeline_type = {
.size = sizeof(struct vk_kgsl_timeline),
.features = (enum vk_sync_features)
(VK_SYNC_FEATURE_TIMELINE |
VK_SYNC_FEATURE_GPU_WAIT |
VK_SYNC_FEATURE_CPU_WAIT |
VK_SYNC_FEATURE_CPU_SIGNAL |
VK_SYNC_FEATURE_WAIT_ANY |
VK_SYNC_FEATURE_WAIT_BEFORE_SIGNAL),
.init = vk_kgsl_timeline_init,
.finish = vk_kgsl_timeline_finish,
.signal = vk_kgsl_timeline_signal,
.signal_many = vk_kgsl_timeline_signal_many,
.get_value = vk_kgsl_timeline_get_value,
.wait_many = vk_kgsl_timeline_wait_many,
};
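
Once this type is registered in the device's sync_types, the common Vulkan runtime reaches the driver through these callbacks. A usage sketch (illustrative only; assumes the vk_sync_create/vk_sync_signal/vk_sync_wait/vk_sync_destroy helpers from Mesa's vk_sync.h):

static VkResult
timeline_roundtrip(struct vk_device *device)
{
   struct vk_sync *sync;
   /* Dispatches to vk_kgsl_timeline_init via .init above. */
   VkResult result = vk_sync_create(device, &vk_kgsl_timeline_type,
                                    VK_SYNC_IS_TIMELINE, 0 /* initial */,
                                    &sync);
   if (result != VK_SUCCESS)
      return result;

   /* CPU-signal point 1 (.signal), then block until it lands (.wait_many). */
   result = vk_sync_signal(device, sync, 1);
   if (result == VK_SUCCESS)
      result = vk_sync_wait(device, sync, 1, VK_SYNC_WAIT_COMPLETE,
                            UINT64_MAX);

   vk_sync_destroy(device, sync);
   return result;
}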
struct tu_kgsl_queue_submit {
struct util_dynarray commands;
struct util_dynarray ranges;
@@ -1320,19 +1570,110 @@ kgsl_bind_finalize(struct tu_kgsl_queue_submit *submit)
}
}
struct kgsl_profiling {
struct kgsl_command_object *cmd_obj;
struct kgsl_cmdbatch_profiling_buffer *buffer;
uint64_t gpu_offset;
#if HAVE_PERFETTO
uint64_t start_ts;
#endif
};
static void
kgsl_profiling_alloc(struct kgsl_profiling *profiling,
struct tu_queue *queue,
struct tu_u_trace_submission_data *u_trace_submission_data)
{
mtx_lock(&queue->device->kgsl_profiling_mutex);
tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
&queue->device->kgsl_profiling_suballoc,
sizeof(struct kgsl_cmdbatch_profiling_buffer), 4);
mtx_unlock(&queue->device->kgsl_profiling_mutex);
profiling->cmd_obj = (struct kgsl_command_object *)
vk_alloc(&queue->device->vk.alloc, sizeof(*profiling->cmd_obj),
alignof(*profiling->cmd_obj), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
struct tu_suballoc_bo *bo = &u_trace_submission_data->kgsl_timestamp_bo;
*profiling->cmd_obj = (struct kgsl_command_object) {
.offset = bo->iova - bo->bo->iova,
.gpuaddr = bo->bo->iova,
.size = sizeof(struct kgsl_cmdbatch_profiling_buffer),
.flags = KGSL_OBJLIST_MEMOBJ | KGSL_OBJLIST_PROFILE,
.id = bo->bo->gem_handle,
};
profiling->buffer =
(struct kgsl_cmdbatch_profiling_buffer *) tu_suballoc_bo_map(bo);
memset(profiling->buffer, 0, sizeof(*profiling->buffer));
}
static void
kgsl_profiling_free(struct kgsl_profiling *profiling,
struct tu_queue *queue,
struct tu_u_trace_submission_data *u_trace_submission_data)
{
mtx_lock(&queue->device->kgsl_profiling_mutex);
tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc,
&u_trace_submission_data->kgsl_timestamp_bo);
mtx_unlock(&queue->device->kgsl_profiling_mutex);
}
#if HAVE_PERFETTO
static int
kgsl_profiling_end_perfetto_submit(struct kgsl_profiling *profiling,
struct tu_queue *queue)
{
/* We need to wait for KGSL to queue the GPU command before we can read
* the timestamp. Since this is just for profiling and doesn't take too
* long, we can just busy-wait for it.
*/
while (p_atomic_read(&profiling->buffer->gpu_ticks_queued) == 0);
struct kgsl_perfcounter_read_group perf = {
.groupid = KGSL_PERFCOUNTER_GROUP_ALWAYSON,
.countable = 0,
.value = 0
};
struct kgsl_perfcounter_read req = {
.reads = &perf,
.count = 1,
};
int ret = safe_ioctl(queue->device->fd, IOCTL_KGSL_PERFCOUNTER_READ, &req);
/* Older KGSL returns garbage in the upper 32 bits. */
uint64_t offseted_gpu_ts = perf.value & 0xffffffff;
profiling->gpu_offset = tu_device_ticks_to_ns(
queue->device, offseted_gpu_ts - profiling->buffer->gpu_ticks_queued);
struct tu_perfetto_clocks clocks = {
.cpu = profiling->buffer->wall_clock_ns,
.gpu_ts = tu_device_ticks_to_ns(queue->device,
profiling->buffer->gpu_ticks_queued),
.gpu_ts_offset = profiling->gpu_offset,
};
clocks = tu_perfetto_end_submit(queue, queue->device->submit_count,
profiling->start_ts, &clocks);
profiling->gpu_offset = clocks.gpu_ts_offset;
return ret;
}
#endif
static VkResult
kgsl_queue_submit(struct tu_queue *queue, void *_submit,
struct vk_sync_wait *waits, uint32_t wait_count,
struct vk_sync_signal *signals, uint32_t signal_count,
struct tu_u_trace_submission_data *u_trace_submission_data)
kgsl_queue_submit_syncobj(struct tu_queue *queue, void *_submit,
struct vk_sync_wait *waits, uint32_t wait_count,
struct vk_sync_signal *signals, uint32_t signal_count,
struct tu_u_trace_submission_data *u_trace_submission_data)
{
struct tu_kgsl_queue_submit *submit =
(struct tu_kgsl_queue_submit *)_submit;
#if HAVE_PERFETTO
uint64_t start_ts = tu_perfetto_begin_submit();
#endif
if (submit->commands.size == 0 && submit->bind_cmds.size == 0) {
/* This handles the case where we have a wait and no commands to submit.
* It is necessary to handle this case separately as the kernel will not
@@ -1399,37 +1740,13 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
if (submit->bind_cmds.size != 0)
kgsl_bind_finalize(submit);
struct kgsl_profiling profiling = { 0 };
if (u_trace_submission_data) {
mtx_lock(&queue->device->kgsl_profiling_mutex);
tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
&queue->device->kgsl_profiling_suballoc,
sizeof(struct kgsl_cmdbatch_profiling_buffer), 4);
mtx_unlock(&queue->device->kgsl_profiling_mutex);
}
uint32_t obj_count = 0;
if (u_trace_submission_data)
obj_count++;
struct kgsl_command_object *objs = (struct kgsl_command_object *)
vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count,
alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
struct kgsl_cmdbatch_profiling_buffer *profiling_buffer = NULL;
uint32_t obj_idx = 0;
if (u_trace_submission_data) {
struct tu_suballoc_bo *bo = &u_trace_submission_data->kgsl_timestamp_bo;
objs[obj_idx++] = (struct kgsl_command_object) {
.offset = bo->iova - bo->bo->iova,
.gpuaddr = bo->bo->iova,
.size = sizeof(struct kgsl_cmdbatch_profiling_buffer),
.flags = KGSL_OBJLIST_MEMOBJ | KGSL_OBJLIST_PROFILE,
.id = bo->bo->gem_handle,
};
profiling_buffer =
(struct kgsl_cmdbatch_profiling_buffer *) tu_suballoc_bo_map(bo);
memset(profiling_buffer, 0, sizeof(*profiling_buffer));
#if HAVE_PERFETTO
profiling.start_ts = tu_perfetto_begin_submit();
#endif
kgsl_profiling_alloc(&profiling, queue, u_trace_submission_data);
}
const struct kgsl_syncobj *wait_semaphores[wait_count];
@@ -1477,7 +1794,6 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
int ret;
uint32_t timestamp = 0;
uint64_t gpu_offset = 0;
if (submit->bind_cmds.size == 0) {
struct kgsl_gpu_command req = {
@@ -1492,11 +1808,11 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
.context_id = queue->msm_queue_id,
};
if (obj_idx) {
if (profiling.cmd_obj) {
req.flags |= KGSL_CMDBATCH_PROFILING;
req.objlist = (uintptr_t) objs;
req.objlist = (uintptr_t) profiling.cmd_obj;
req.objsize = sizeof(struct kgsl_command_object);
req.numobjs = obj_idx;
req.numobjs = 1;
}
ret = safe_ioctl(queue->device->physical_device->local_fd,
@@ -1546,42 +1862,8 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
}
#if HAVE_PERFETTO
if (profiling_buffer) {
/* We need to wait for KGSL to queue the GPU command before we can read
* the timestamp. Since this is just for profiling and doesn't take too
* long, we can just busy-wait for it.
*/
while (p_atomic_read(&profiling_buffer->gpu_ticks_queued) == 0);
struct kgsl_perfcounter_read_group perf = {
.groupid = KGSL_PERFCOUNTER_GROUP_ALWAYSON,
.countable = 0,
.value = 0
};
struct kgsl_perfcounter_read req = {
.reads = &perf,
.count = 1,
};
ret = safe_ioctl(queue->device->fd, IOCTL_KGSL_PERFCOUNTER_READ, &req);
/* Older KGSL returns garbage in the upper 32 bits. */
uint64_t offseted_gpu_ts = perf.value & 0xffffffff;
gpu_offset = tu_device_ticks_to_ns(
queue->device, offseted_gpu_ts - profiling_buffer->gpu_ticks_queued);
struct tu_perfetto_clocks clocks = {
.cpu = profiling_buffer->wall_clock_ns,
.gpu_ts = tu_device_ticks_to_ns(queue->device,
profiling_buffer->gpu_ticks_queued),
.gpu_ts_offset = gpu_offset,
};
clocks = tu_perfetto_end_submit(queue, queue->device->submit_count,
start_ts, &clocks);
gpu_offset = clocks.gpu_ts_offset;
}
if (profiling.buffer)
ret = kgsl_profiling_end_perfetto_submit(&profiling, queue);
#endif
kgsl_syncobj_destroy(&wait_sync);
@@ -1605,23 +1887,293 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
signal_sync->timestamp = timestamp;
}
if (u_trace_submission_data) {
struct tu_u_trace_submission_data *submission_data =
u_trace_submission_data;
submission_data->gpu_ts_offset = gpu_offset;
}
if (u_trace_submission_data)
u_trace_submission_data->gpu_ts_offset = profiling.gpu_offset;
fail_submit:
if (result != VK_SUCCESS && u_trace_submission_data) {
mtx_lock(&queue->device->kgsl_profiling_mutex);
tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc,
&u_trace_submission_data->kgsl_timestamp_bo);
mtx_unlock(&queue->device->kgsl_profiling_mutex);
}
if (result != VK_SUCCESS && u_trace_submission_data)
kgsl_profiling_free(&profiling, queue, u_trace_submission_data);
return result;
}
static VkResult
kgsl_queue_submit_timeline(struct tu_queue *queue, void *_submit,
struct vk_sync_wait *waits, uint32_t wait_count,
struct vk_sync_signal *signals, uint32_t signal_count,
struct tu_u_trace_submission_data *u_trace_submission_data)
{
struct tu_kgsl_queue_submit *submit =
(struct tu_kgsl_queue_submit *)_submit;
/* The queue timeline is included in the waits and signals, as it is
* necessary for the fast path below to function properly. Waiting on the
* current queue timeline seqno ensures that the work of the previous
* submit has completed; signaling an incremented queue timeline seqno
* indicates that the work of the current submit has completed.
*/
const uint64_t previous_submit_timeline_seqno = queue->kgsl_queue_timeline_seqno;
const uint64_t current_submit_timeline_seqno = previous_submit_timeline_seqno + 1;
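/* For example, on a freshly created queue (its timeline is created with
* seqno 0), submit #1 waits on queue seqno 0 and signals queue seqno 1,
* submit #2 waits on queue seqno 1 and signals queue seqno 2, and so on,
* serializing submits on the queue's own timeline.
*/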
STACK_ARRAY(struct kgsl_timeline_val, wait_timeline_vals, wait_count + 1);
for (uint32_t i = 0; i < wait_count; ++i) {
struct vk_kgsl_timeline *timeline = container_of(waits[i].sync, struct vk_kgsl_timeline, vk);
wait_timeline_vals[i] = (struct kgsl_timeline_val) {
.seqno = waits[i].wait_value,
.timeline = timeline->id
};
}
wait_timeline_vals[wait_count] = (struct kgsl_timeline_val) {
.seqno = previous_submit_timeline_seqno,
.timeline = queue->kgsl_queue_timeline_id
};
STACK_ARRAY(struct kgsl_timeline_val, signal_timeline_vals, signal_count + 1);
for (uint32_t i = 0; i < signal_count; ++i) {
struct vk_kgsl_timeline *timeline = container_of(signals[i].sync, struct vk_kgsl_timeline, vk);
signal_timeline_vals[i] = (struct kgsl_timeline_val) {
.seqno = signals[i].signal_value,
.timeline = timeline->id
};
}
signal_timeline_vals[signal_count] = (struct kgsl_timeline_val) {
.seqno = current_submit_timeline_seqno,
.timeline = queue->kgsl_queue_timeline_id
};
if (submit->commands.size == 0 && submit->bind_cmds.size == 0) {
/* First part of the zero-command, zero-bind fast path. If all wait
* timelines are already signaled, fire off the signals and finish.
*/
int ret = kgsl_timeline_wait_ioctl(queue->device->physical_device->local_fd, 0,
wait_timeline_vals, wait_count + 1,
KGSL_TIMELINE_WAIT_ALL);
if (ret == 0) {
VkResult result = VK_SUCCESS;
ret = kgsl_timeline_signal_ioctl(queue->device->physical_device->local_fd,
signal_timeline_vals, signal_count + 1);
if (ret) {
result = vk_device_set_lost(&queue->device->vk,
"signal submit failed\n");
} else {
queue->kgsl_queue_timeline_seqno = current_submit_timeline_seqno;
}
STACK_ARRAY_FINISH(signal_timeline_vals);
STACK_ARRAY_FINISH(wait_timeline_vals);
return result;
}
}
struct kgsl_cmd_syncpoint_timeline cmd_syncpoint_timeline = {
.timelines = (uint64_t)(uintptr_t) wait_timeline_vals,
.count = wait_count + 1,
.timelines_size = sizeof(struct kgsl_timeline_val)
};
struct kgsl_command_syncpoint cmd_syncpoint = {
.priv = (uint64_t)(uintptr_t) &cmd_syncpoint_timeline,
.size = sizeof(struct kgsl_cmd_syncpoint_timeline),
.type = KGSL_CMD_SYNCPOINT_TYPE_TIMELINE
};
struct kgsl_gpu_aux_command_timeline aux_cmd_timeline = {
.timelines = (uint64_t)(uintptr_t) signal_timeline_vals,
.count = signal_count + 1,
.timelines_size = sizeof(struct kgsl_timeline_val)
};
struct kgsl_gpu_aux_command_generic aux_cmd_generic_timeline = {
.priv = (uint64_t)(uintptr_t) &aux_cmd_timeline,
.size = sizeof(struct kgsl_gpu_aux_command_timeline),
.type = KGSL_GPU_AUX_COMMAND_TIMELINE,
};
struct kgsl_gpu_aux_command aux_cmd;
if (submit->commands.size == 0 && submit->bind_cmds.size == 0) {
/* Second part of the zero-command, zero-bind fast path. Not all wait
* timelines have been signaled yet, but we can dispatch an aux command
* that will wait on those timelines and then fire off the necessary
* timeline signals.
*/
aux_cmd = {
.flags = KGSL_GPU_AUX_COMMAND_TIMELINE | KGSL_GPU_AUX_COMMAND_SYNC,
.cmdlist = (uint64_t)(uintptr_t) &aux_cmd_generic_timeline,
.cmdsize = sizeof(struct kgsl_gpu_aux_command_generic),
.numcmds = 1,
.synclist = (uint64_t)(uintptr_t) &cmd_syncpoint,
.syncsize = sizeof(struct kgsl_command_syncpoint),
.numsyncs = 1,
.context_id = queue->msm_queue_id
};
VkResult result = VK_SUCCESS;
int ret = safe_ioctl(queue->device->physical_device->local_fd,
IOCTL_KGSL_GPU_AUX_COMMAND, &aux_cmd);
if (ret) {
result = vk_device_set_lost(&queue->device->vk,
"timeline signal aux command submit failed\n");
} else {
queue->kgsl_queue_timeline_seqno = current_submit_timeline_seqno;
}
STACK_ARRAY_FINISH(signal_timeline_vals);
STACK_ARRAY_FINISH(wait_timeline_vals);
return result;
}
int ret;
uint32_t timestamp;
/* Sanity check: we are dealing with exactly one of commands or binds
* here. In either case, the first dispatched command should wait on the
* specified timelines; an additional aux command will take care of the
* timeline signals.
*/
assert((submit->commands.size == 0) ^ (submit->bind_cmds.size == 0));
struct kgsl_profiling profiling = { 0 };
if (u_trace_submission_data) {
#if HAVE_PERFETTO
profiling.start_ts = tu_perfetto_begin_submit();
#endif
kgsl_profiling_alloc(&profiling, queue, u_trace_submission_data);
}
if (submit->commands.size != 0) {
struct kgsl_gpu_command req = {
.flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
.cmdlist = (uint64_t)(uintptr_t) submit->commands.data,
.cmdsize = sizeof(struct kgsl_command_object),
.numcmds = util_dynarray_num_elements(&submit->commands,
struct kgsl_command_object),
.synclist = (uint64_t)(uintptr_t) &cmd_syncpoint,
.syncsize = sizeof(struct kgsl_command_syncpoint),
.numsyncs = 1,
.context_id = queue->msm_queue_id,
};
if (profiling.cmd_obj) {
req.flags |= KGSL_CMDBATCH_PROFILING;
req.objlist = (uintptr_t) profiling.cmd_obj;
req.objsize = sizeof(struct kgsl_command_object);
req.numobjs = 1;
}
ret = safe_ioctl(queue->device->physical_device->local_fd,
IOCTL_KGSL_GPU_COMMAND, &req);
timestamp = req.timestamp;
}
if (submit->bind_cmds.size != 0) {
kgsl_bind_finalize(submit);
/* kgsl doesn't support multiple bind commands at once */
uint32_t i = 0;
util_dynarray_foreach(&submit->bind_cmds,
struct kgsl_gpu_aux_command_bind, aux_cmd_bind) {
struct kgsl_gpu_aux_command_generic aux_cmd_generic_bind = {
.priv = (uint64_t)(uintptr_t) aux_cmd_bind,
.size = sizeof(struct kgsl_gpu_aux_command_bind),
.type = KGSL_GPU_AUX_COMMAND_BIND,
};
aux_cmd = {
.flags = KGSL_GPU_AUX_COMMAND_BIND,
.cmdlist = (uint64_t)(uintptr_t) &aux_cmd_generic_bind,
.cmdsize = sizeof(struct kgsl_gpu_aux_command_generic),
.numcmds = 1,
.context_id = queue->msm_queue_id,
};
if (i == 0) {
aux_cmd.flags |= KGSL_GPU_AUX_COMMAND_SYNC;
aux_cmd.synclist = (uint64_t)(uintptr_t) &cmd_syncpoint;
aux_cmd.syncsize = sizeof(struct kgsl_command_syncpoint);
aux_cmd.numsyncs = 1;
}
ret = safe_ioctl(queue->device->physical_device->local_fd,
IOCTL_KGSL_GPU_AUX_COMMAND, &aux_cmd);
timestamp = aux_cmd.timestamp;
i++;
if (ret)
break;
}
}
#if HAVE_PERFETTO
if (profiling.buffer)
ret = kgsl_profiling_end_perfetto_submit(&profiling, queue);
#endif
VkResult result = VK_SUCCESS;
if (ret) {
result = vk_device_set_lost(&queue->device->vk,
"submit failed: %s\n", strerror(errno));
goto fail_submit;
}
aux_cmd = {
.flags = KGSL_GPU_AUX_COMMAND_TIMELINE,
.cmdlist = (uint64_t)(uintptr_t) &aux_cmd_generic_timeline,
.cmdsize = sizeof(struct kgsl_gpu_aux_command_generic),
.numcmds = 1,
.context_id = queue->msm_queue_id
};
ret = safe_ioctl(queue->device->physical_device->local_fd,
IOCTL_KGSL_GPU_AUX_COMMAND, &aux_cmd);
if (ret) {
result = vk_device_set_lost(&queue->device->vk,
"timeline submit failed: %s\n",
strerror(errno));
goto fail_submit;
}
timestamp = aux_cmd.timestamp;
queue->kgsl_queue_timeline_seqno = current_submit_timeline_seqno;
p_atomic_set(&queue->fence, timestamp);
if (u_trace_submission_data)
u_trace_submission_data->gpu_ts_offset = profiling.gpu_offset;
fail_submit:
if (result != VK_SUCCESS && u_trace_submission_data)
kgsl_profiling_free(&profiling, queue, u_trace_submission_data);
STACK_ARRAY_FINISH(signal_timeline_vals);
STACK_ARRAY_FINISH(wait_timeline_vals);
return result;
}
static VkResult
kgsl_queue_submit(struct tu_queue *queue, void *submit,
struct vk_sync_wait *waits, uint32_t wait_count,
struct vk_sync_signal *signals, uint32_t signal_count,
struct tu_u_trace_submission_data *u_trace_submission_data)
{
switch (queue->device->physical_device->kgsl_sync_impl_type) {
case TU_KGSL_SYNC_IMPL_TYPE_SYNCOBJ:
return kgsl_queue_submit_syncobj(queue, submit,
waits, wait_count,
signals, signal_count,
u_trace_submission_data);
case TU_KGSL_SYNC_IMPL_TYPE_TIMELINE:
return kgsl_queue_submit_timeline(queue, submit,
waits, wait_count,
signals, signal_count,
u_trace_submission_data);
}
return VK_ERROR_UNKNOWN;
}
static VkResult
kgsl_device_init(struct tu_device *dev)
{
@@ -1731,6 +2283,7 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
static const char dma_heap_path[] = "/dev/dma_heap/system";
static const char ion_path[] = "/dev/ion";
int dma_fd;
uint32_t dummy_timeline_id = 0;
dma_fd = open(dma_heap_path, O_RDONLY);
if (dma_fd >= 0) {
@@ -1796,11 +2349,27 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
device->has_raytracing = tu_kgsl_get_raytracing(fd);
device->submitqueue_priority_count = 1;
device->timeline_type = vk_sync_timeline_get_type(&vk_kgsl_sync_type);
device->sync_types[0] = &vk_kgsl_sync_type;
device->sync_types[1] = &device->timeline_type.sync;
/* Prefer the timeline-based sync implementation if supported by the kernel. */
if (kgsl_timeline_create_ioctl(fd, 0, &dummy_timeline_id) == 0) {
kgsl_timeline_destroy_ioctl(fd, dummy_timeline_id);
device->kgsl_sync_impl_type = TU_KGSL_SYNC_IMPL_TYPE_TIMELINE;
} else {
device->kgsl_sync_impl_type = TU_KGSL_SYNC_IMPL_TYPE_SYNCOBJ;
}
switch (device->kgsl_sync_impl_type) {
case TU_KGSL_SYNC_IMPL_TYPE_SYNCOBJ:
device->timeline_type = vk_sync_timeline_get_type(&vk_kgsl_sync_type);
device->sync_types[0] = &vk_kgsl_sync_type;
device->sync_types[1] = &device->timeline_type.sync;
break;
case TU_KGSL_SYNC_IMPL_TYPE_TIMELINE:
device->binary_type = vk_sync_binary_get_type(&vk_kgsl_timeline_type);
device->sync_types[0] = &vk_kgsl_timeline_type;
device->sync_types[1] = &device->binary_type.sync;
break;
}
device->sync_types[2] = NULL;
device->heap.size = tu_get_system_heap_size(device);

View file

@@ -36,6 +36,9 @@ struct tu_queue
unsigned render_pass_idx;
int fence; /* timestamp/fence of the last queue submission */
uint32_t kgsl_queue_timeline_id;
uint64_t kgsl_queue_timeline_seqno;
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)