mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 05:00:09 +01:00
tu/kgsl: Support u_trace and perfetto
Raw GPU time is retrieved via kgsl_cmdbatch_profiling_buffer; the offset-adjusted GPU time is retrieved via KGSL_PERFCOUNTER_GROUP_ALWAYSON. This makes it possible to calculate the GPU time offset for each submission and to synchronize the CPU and GPU time domains. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12805>
This commit is contained in:
parent
3ccd199708
commit
ec268fa5b6
4 changed files with 186 additions and 2 deletions
|
|
@ -1888,6 +1888,13 @@ tu_u_trace_submission_data_finish(
|
|||
}
|
||||
}
|
||||
|
||||
if (submission_data->kgsl_timestamp_bo.bo) {
|
||||
mtx_lock(&device->kgsl_profiling_mutex);
|
||||
tu_suballoc_bo_free(&device->kgsl_profiling_suballoc,
|
||||
&submission_data->kgsl_timestamp_bo);
|
||||
mtx_unlock(&device->kgsl_profiling_mutex);
|
||||
}
|
||||
|
||||
vk_free(&device->vk.alloc, submission_data->cmd_trace_data);
|
||||
vk_free(&device->vk.alloc, submission_data->syncobj);
|
||||
vk_free(&device->vk.alloc, submission_data);
|
||||
|
|
@ -2086,6 +2093,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
mtx_init(&device->bo_mutex, mtx_plain);
|
||||
mtx_init(&device->pipeline_mutex, mtx_plain);
|
||||
mtx_init(&device->autotune_mutex, mtx_plain);
|
||||
mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
|
||||
u_rwlock_init(&device->dma_bo_lock);
|
||||
pthread_mutex_init(&device->submit_mutex, NULL);
|
||||
|
||||
|
|
@ -2181,6 +2189,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
(enum tu_bo_alloc_flags) (TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP));
|
||||
tu_bo_suballocator_init(&device->autotune_suballoc, device,
|
||||
128 * 1024, TU_BO_ALLOC_NO_FLAGS);
|
||||
if (is_kgsl(physical_device->instance)) {
|
||||
tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
|
||||
128 * 1024, TU_BO_ALLOC_NO_FLAGS);
|
||||
}
|
||||
|
||||
result = tu_bo_init_new(device, &device->global_bo, global_size,
|
||||
TU_BO_ALLOC_ALLOW_DUMP, "global");
|
||||
|
|
@ -2428,6 +2440,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
|
|||
|
||||
tu_bo_suballocator_finish(&device->pipeline_suballoc);
|
||||
tu_bo_suballocator_finish(&device->autotune_suballoc);
|
||||
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
|
||||
|
||||
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
|
||||
for (unsigned q = 0; q < device->queue_count[i]; q++)
|
||||
|
|
|
|||
|
|
@ -276,6 +276,12 @@ struct tu_device
|
|||
struct tu_suballocator autotune_suballoc;
|
||||
mtx_t autotune_mutex;
|
||||
|
||||
/* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
|
||||
* each submission.
|
||||
*/
|
||||
struct tu_suballocator kgsl_profiling_suballoc;
|
||||
mtx_t kgsl_profiling_mutex;
|
||||
|
||||
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
|
||||
#define TU_TESS_FACTOR_SIZE (8 * 1024)
|
||||
#define TU_TESS_PARAM_SIZE (128 * 1024)
|
||||
|
|
@ -526,6 +532,9 @@ struct tu_u_trace_submission_data
|
|||
* offset may change between submissions due to power cycle.
|
||||
*/
|
||||
uint64_t gpu_ts_offset;
|
||||
|
||||
/* KGSL needs GPU memory to write submission timestamps into */
|
||||
struct tu_suballoc_bo kgsl_timestamp_bo;
|
||||
};
|
||||
|
||||
VkResult
|
||||
|
|
|
|||
|
|
@ -332,6 +332,12 @@ struct kgsl_syncobj
|
|||
int fd;
|
||||
};
|
||||
|
||||
/* KGSL flavour of the u_trace sync object: identifies one submission so
 * that its completion can be waited on later.
 */
struct tu_u_trace_syncobj
{
   /* Id of the queue the submission went to; it is passed to the kernel as
    * the context id when waiting on the timestamp.
    */
   uint32_t msm_queue_id;
   /* Timestamp reported for the submission by the GPU command ioctl. */
   uint32_t timestamp;
};
|
||||
|
||||
static void
|
||||
kgsl_syncobj_init(struct kgsl_syncobj *s, bool signaled)
|
||||
{
|
||||
|
|
@ -930,6 +936,9 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
{
|
||||
MESA_TRACE_FUNC();
|
||||
|
||||
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
|
||||
bool has_trace_points = false;
|
||||
|
||||
if (vk_submit->command_buffer_count == 0) {
|
||||
pthread_mutex_lock(&queue->device->submit_mutex);
|
||||
|
||||
|
|
@ -1006,6 +1015,14 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
entry_count++;
|
||||
|
||||
entry_count += cmd_buffer->cs.entry_count;
|
||||
|
||||
if (u_trace_enabled && u_trace_has_points(&cmd_buffers[i]->trace)) {
|
||||
if (!(cmd_buffers[i]->usage_flags &
|
||||
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
|
||||
entry_count++;
|
||||
|
||||
has_trace_points = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count))
|
||||
|
|
@ -1019,6 +1036,26 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
}
|
||||
|
||||
uint32_t obj_count = 0;
|
||||
if (has_trace_points)
|
||||
obj_count++;
|
||||
|
||||
struct kgsl_command_object *objs = (struct kgsl_command_object *)
|
||||
vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count,
|
||||
alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||
|
||||
struct tu_u_trace_submission_data *u_trace_submission_data = NULL;
|
||||
if (has_trace_points) {
|
||||
tu_u_trace_submission_data_create(
|
||||
queue->device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);
|
||||
|
||||
mtx_lock(&queue->device->kgsl_profiling_mutex);
|
||||
tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
|
||||
&queue->device->kgsl_profiling_suballoc,
|
||||
sizeof(struct kgsl_cmdbatch_profiling_buffer), 4);
|
||||
mtx_unlock(&queue->device->kgsl_profiling_mutex);
|
||||
}
|
||||
|
||||
uint32_t entry_idx = 0;
|
||||
for (uint32_t i = 0; i < cmdbuf_count; i++) {
|
||||
struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i];
|
||||
|
|
@ -1044,6 +1081,36 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
.id = cs->entries[j].bo->gem_handle,
|
||||
};
|
||||
}
|
||||
|
||||
if (u_trace_submission_data &&
|
||||
u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
|
||||
struct tu_cs_entry *trace_cs_entry =
|
||||
&u_trace_submission_data->cmd_trace_data[i]
|
||||
.timestamp_copy_cs->entries[0];
|
||||
cmds[entry_idx++] = (struct kgsl_command_object) {
|
||||
.offset = trace_cs_entry->offset,
|
||||
.gpuaddr = trace_cs_entry->bo->iova,
|
||||
.size = trace_cs_entry->size,
|
||||
.flags = KGSL_CMDLIST_IB,
|
||||
.id = trace_cs_entry->bo->gem_handle,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
struct kgsl_cmdbatch_profiling_buffer *profiling_buffer = NULL;
|
||||
uint32_t obj_idx = 0;
|
||||
if (u_trace_submission_data) {
|
||||
struct tu_suballoc_bo *bo = &u_trace_submission_data->kgsl_timestamp_bo;
|
||||
|
||||
objs[obj_idx++] = (struct kgsl_command_object) {
|
||||
.gpuaddr = bo->iova,
|
||||
.offset = bo->iova - bo->bo->iova,
|
||||
.size = sizeof(struct kgsl_cmdbatch_profiling_buffer),
|
||||
.flags = KGSL_OBJLIST_MEMOBJ | KGSL_OBJLIST_PROFILE,
|
||||
.id = bo->bo->gem_handle,
|
||||
};
|
||||
profiling_buffer =
|
||||
(struct kgsl_cmdbatch_profiling_buffer *) tu_suballoc_bo_map(bo);
|
||||
}
|
||||
|
||||
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
|
||||
|
|
@ -1112,9 +1179,49 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
.context_id = queue->msm_queue_id,
|
||||
};
|
||||
|
||||
if (obj_idx) {
|
||||
req.flags |= KGSL_CMDBATCH_PROFILING;
|
||||
req.objlist = (uintptr_t) objs;
|
||||
req.objsize = sizeof(struct kgsl_command_object);
|
||||
req.numobjs = obj_idx;
|
||||
}
|
||||
|
||||
int ret = safe_ioctl(queue->device->physical_device->local_fd,
|
||||
IOCTL_KGSL_GPU_COMMAND, &req);
|
||||
|
||||
uint64_t gpu_offset = 0;
|
||||
#if HAVE_PERFETTO
|
||||
if (profiling_buffer && profiling_buffer->gpu_ticks_queued) {
|
||||
struct kgsl_perfcounter_read_group perf = {
|
||||
.groupid = KGSL_PERFCOUNTER_GROUP_ALWAYSON,
|
||||
.countable = 0,
|
||||
.value = 0
|
||||
};
|
||||
|
||||
struct kgsl_perfcounter_read req = {
|
||||
.reads = &perf,
|
||||
.count = 1,
|
||||
};
|
||||
|
||||
ret = safe_ioctl(queue->device->fd, IOCTL_KGSL_PERFCOUNTER_READ, &req);
|
||||
/* Older KGSL has some kind of garbage in upper 32 bits */
|
||||
uint64_t offseted_gpu_ts = perf.value & 0xffffffff;
|
||||
|
||||
gpu_offset = tu_device_ticks_to_ns(
|
||||
queue->device, offseted_gpu_ts - profiling_buffer->gpu_ticks_queued);
|
||||
|
||||
struct tu_perfetto_clocks clocks = {
|
||||
.cpu = profiling_buffer->wall_clock_ns,
|
||||
.gpu_ts = tu_device_ticks_to_ns(queue->device,
|
||||
profiling_buffer->gpu_ticks_queued),
|
||||
.gpu_ts_offset = gpu_offset,
|
||||
};
|
||||
|
||||
clocks = tu_perfetto_submit(queue->device, queue->device->submit_count, &clocks);
|
||||
gpu_offset = clocks.gpu_ts_offset;
|
||||
}
|
||||
#endif
|
||||
|
||||
kgsl_syncobj_destroy(&wait_sync);
|
||||
|
||||
if (ret) {
|
||||
|
|
@ -1136,9 +1243,40 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
signal_sync->timestamp = req.timestamp;
|
||||
}
|
||||
|
||||
if (u_trace_submission_data) {
|
||||
struct tu_u_trace_submission_data *submission_data =
|
||||
u_trace_submission_data;
|
||||
submission_data->submission_id = queue->device->submit_count;
|
||||
submission_data->gpu_ts_offset = gpu_offset;
|
||||
/* We have to allocate it here since it is different between drm/kgsl */
|
||||
submission_data->syncobj = (struct tu_u_trace_syncobj *)
|
||||
vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
|
||||
8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
|
||||
submission_data->syncobj->timestamp = req.timestamp;
|
||||
submission_data->syncobj->msm_queue_id = queue->msm_queue_id;
|
||||
|
||||
u_trace_submission_data = NULL;
|
||||
|
||||
for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
|
||||
bool free_data = i == submission_data->last_buffer_with_tracepoints;
|
||||
if (submission_data->cmd_trace_data[i].trace)
|
||||
u_trace_flush(submission_data->cmd_trace_data[i].trace,
|
||||
submission_data, free_data);
|
||||
|
||||
if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
|
||||
/* u_trace is owned by cmd_buffer */
|
||||
submission_data->cmd_trace_data[i].trace = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
queue->device->submit_count++;
|
||||
|
||||
pthread_mutex_unlock(&queue->device->submit_mutex);
|
||||
pthread_cond_broadcast(&queue->device->timeline_cond);
|
||||
|
||||
u_trace_context_process(&queue->device->trace_context, true);
|
||||
|
||||
if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers)
|
||||
vk_free(&queue->device->vk.alloc, cmd_buffers);
|
||||
|
||||
|
|
@ -1149,6 +1287,13 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
|
|||
fail_submit:
|
||||
pthread_mutex_unlock(&queue->device->submit_mutex);
|
||||
|
||||
if (result != VK_SUCCESS) {
|
||||
mtx_lock(&queue->device->kgsl_profiling_mutex);
|
||||
tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc,
|
||||
&u_trace_submission_data->kgsl_timestamp_bo);
|
||||
mtx_unlock(&queue->device->kgsl_profiling_mutex);
|
||||
}
|
||||
|
||||
if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers)
|
||||
vk_free(&queue->device->vk.alloc, cmd_buffers);
|
||||
|
||||
|
|
@ -1160,7 +1305,19 @@ fail_submit:
|
|||
static VkResult
|
||||
kgsl_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
|
||||
{
|
||||
tu_finishme("tu_device_wait_u_trace");
|
||||
struct kgsl_device_waittimestamp_ctxtid req = {
|
||||
.context_id = syncobj->msm_queue_id,
|
||||
.timestamp = syncobj->timestamp,
|
||||
.timeout = 5000, // 5s
|
||||
};
|
||||
|
||||
int ret = safe_ioctl(dev->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &req);
|
||||
|
||||
if (ret) {
|
||||
assert(errno == ETIME);
|
||||
return VK_TIMEOUT;
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -1180,7 +1337,7 @@ kgsl_device_finish(struct tu_device *dev)
|
|||
/**
 * GPU timestamp query hook for KGSL.
 *
 * This entry point is marked unreachable: callers are not expected to use
 * it on KGSL.
 * NOTE(review): presumably because GPU time is taken from the per-submission
 * profiling buffer at submit time instead -- confirm against the callers.
 *
 * \param dev  unused
 * \param ts   unused; never written
 * \return 0 (only present to satisfy the signature)
 */
static int
kgsl_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
{
   unreachable("");
   return 0;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -311,7 +311,12 @@ tu_perfetto_init(void)
|
|||
util_perfetto_init();
|
||||
|
||||
perfetto::DataSourceDescriptor dsd;
|
||||
#ifdef ANDROID
|
||||
/* AGI requires this name */
|
||||
dsd.set_name("gpu.renderstages");
|
||||
#else
|
||||
dsd.set_name("gpu.renderstages.msm");
|
||||
#endif
|
||||
TuRenderpassDataSource::Register(dsd);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue