tu/kgsl: Support u_trace and perfetto

Raw GPU time is retrieved via kgsl_cmdbatch_profiling_buffer, while the
offset GPU time is read from KGSL_PERFCOUNTER_GROUP_ALWAYSON. This makes
it possible to calculate a GPU time offset for each submission and to
synchronize the CPU and GPU time domains.
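
Concretely, each submission yields two readings of the GPU clock for
(approximately) the same moment: the raw tick value the kernel writes into
the profiling buffer when the batch is queued, and an ALWAYSON counter value
read back right after the submission ioctl. Below is a minimal sketch of the
resulting offset math outside the driver; ticks_to_ns() and the 19.2 MHz
always-on counter frequency are assumptions standing in for the driver's
tu_device_ticks_to_ns() in the diff further down.

   #include <stdint.h>

   /* Assumed tick rate of the Adreno always-on counter; the real driver
    * derives this conversion elsewhere (tu_device_ticks_to_ns()). */
   static uint64_t
   ticks_to_ns(uint64_t ticks)
   {
      return ticks * 1000000000ull / 19200000ull; /* ignores overflow */
   }

   /* Offset that maps raw profiling-buffer ticks onto the ALWAYSON time
    * base; it may change between submissions due to GPU power cycles. */
   static uint64_t
   compute_gpu_ts_offset(uint64_t alwayson_read, uint64_t gpu_ticks_queued)
   {
      /* Older KGSL kernels return garbage in the upper 32 bits. */
      uint64_t offseted_gpu_ts = alwayson_read & 0xffffffff;
      return ticks_to_ns(offseted_gpu_ts - gpu_ticks_queued);
   }

   /* A raw GPU timestamp ts then lands on the CPU timeline roughly as
    *   cpu_ns ~= ticks_to_ns(ts) + compute_gpu_ts_offset(...)
    * which is what gets handed to perfetto as gpu_ts_offset. */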

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12805>
Danylo Piliaiev 2023-08-10 11:54:44 +02:00 committed by Marge Bot
parent 3ccd199708
commit ec268fa5b6
4 changed files with 186 additions and 2 deletions


@@ -1888,6 +1888,13 @@ tu_u_trace_submission_data_finish(
       }
    }
 
+   if (submission_data->kgsl_timestamp_bo.bo) {
+      mtx_lock(&device->kgsl_profiling_mutex);
+      tu_suballoc_bo_free(&device->kgsl_profiling_suballoc,
+                          &submission_data->kgsl_timestamp_bo);
+      mtx_unlock(&device->kgsl_profiling_mutex);
+   }
+
    vk_free(&device->vk.alloc, submission_data->cmd_trace_data);
    vk_free(&device->vk.alloc, submission_data->syncobj);
    vk_free(&device->vk.alloc, submission_data);
@@ -2086,6 +2093,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    mtx_init(&device->bo_mutex, mtx_plain);
    mtx_init(&device->pipeline_mutex, mtx_plain);
    mtx_init(&device->autotune_mutex, mtx_plain);
+   mtx_init(&device->kgsl_profiling_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
    pthread_mutex_init(&device->submit_mutex, NULL);
@@ -2181,6 +2189,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
                            (enum tu_bo_alloc_flags) (TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP));
    tu_bo_suballocator_init(&device->autotune_suballoc, device,
                            128 * 1024, TU_BO_ALLOC_NO_FLAGS);
+   if (is_kgsl(physical_device->instance)) {
+      tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device,
+                              128 * 1024, TU_BO_ALLOC_NO_FLAGS);
+   }
 
    result = tu_bo_init_new(device, &device->global_bo, global_size,
                            TU_BO_ALLOC_ALLOW_DUMP, "global");
@@ -2428,6 +2440,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    tu_bo_suballocator_finish(&device->pipeline_suballoc);
    tu_bo_suballocator_finish(&device->autotune_suballoc);
+   tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
 
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
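
The dedicated suballocator and mutex mirror the autotune pair above; since
every traced submission only needs a chunk the size of the kernel's profiling
buffer, carving it out of a shared 128 KiB pool presumably avoids a kernel BO
allocation on each submit, and the mutex guards the pool against concurrent
submissions.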


@@ -276,6 +276,12 @@ struct tu_device
    struct tu_suballocator autotune_suballoc;
    mtx_t autotune_mutex;
 
+   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
+    * each submission.
+    */
+   struct tu_suballocator kgsl_profiling_suballoc;
+   mtx_t kgsl_profiling_mutex;
+
    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
 #define TU_TESS_FACTOR_SIZE (8 * 1024)
 #define TU_TESS_PARAM_SIZE (128 * 1024)
@@ -526,6 +532,9 @@ struct tu_u_trace_submission_data
     * offset may change between submissions due to power cycle.
     */
    uint64_t gpu_ts_offset;
+
+   /* KGSL needs GPU memory to write submission timestamps into */
+   struct tu_suballoc_bo kgsl_timestamp_bo;
 };
 
 VkResult
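
For reference, the layout kgsl_timestamp_bo must hold is the downstream
kernel's profiling buffer. The definition below is a sketch of the msm_kgsl.h
UAPI as I recall it, not part of this change, so treat the exact fields as an
assumption:

   /* Assumed downstream msm_kgsl.h layout; the kernel fills these fields
    * when the submission sets KGSL_CMDBATCH_PROFILING and attaches an
    * object flagged KGSL_OBJLIST_PROFILE. */
   struct kgsl_cmdbatch_profiling_buffer {
      uint64_t wall_clock_s;        /* CPU wall clock at queue time */
      uint64_t wall_clock_ns;
      uint64_t gpu_ticks_queued;    /* raw GPU ticks when queued */
      uint64_t gpu_ticks_submitted; /* ... when execution started */
      uint64_t gpu_ticks_retired;   /* ... when the batch retired */
   };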


@@ -332,6 +332,12 @@ struct kgsl_syncobj
    int fd;
 };
 
+struct tu_u_trace_syncobj
+{
+   uint32_t msm_queue_id;
+   uint32_t timestamp;
+};
+
 static void
 kgsl_syncobj_init(struct kgsl_syncobj *s, bool signaled)
 {
@@ -930,6 +936,9 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
 {
    MESA_TRACE_FUNC();
 
+   bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
+   bool has_trace_points = false;
+
    if (vk_submit->command_buffer_count == 0) {
       pthread_mutex_lock(&queue->device->submit_mutex);
@@ -1006,6 +1015,14 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
       entry_count++;
       entry_count += cmd_buffer->cs.entry_count;
 
+      if (u_trace_enabled && u_trace_has_points(&cmd_buffers[i]->trace)) {
+         if (!(cmd_buffers[i]->usage_flags &
+               VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
+            entry_count++;
+
+         has_trace_points = true;
+      }
    }
 
    if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count))
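
The extra entry counted for reusable command buffers (those without the
ONE_TIME_SUBMIT bit) is presumably for the timestamp-copy IB that shows up
below as timestamp_copy_cs: a buffer that may be submitted again has to copy
its trace timestamps out into per-submission storage, so each such buffer
contributes one more command object to the submission.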
@@ -1019,6 +1036,26 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
       return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
    }
 
+   uint32_t obj_count = 0;
+
+   if (has_trace_points)
+      obj_count++;
+
+   struct kgsl_command_object *objs = (struct kgsl_command_object *)
+      vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count,
+               alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+   struct tu_u_trace_submission_data *u_trace_submission_data = NULL;
+   if (has_trace_points) {
+      tu_u_trace_submission_data_create(
+         queue->device, cmd_buffers, cmdbuf_count, &u_trace_submission_data);
+
+      mtx_lock(&queue->device->kgsl_profiling_mutex);
+      tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
+                           &queue->device->kgsl_profiling_suballoc,
+                           sizeof(struct kgsl_cmdbatch_profiling_buffer), 4);
+      mtx_unlock(&queue->device->kgsl_profiling_mutex);
+   }
+
    uint32_t entry_idx = 0;
    for (uint32_t i = 0; i < cmdbuf_count; i++) {
       struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i];
@@ -1044,6 +1081,36 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
             .id = cs->entries[j].bo->gem_handle,
          };
       }
+
+      if (u_trace_submission_data &&
+          u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
+         struct tu_cs_entry *trace_cs_entry =
+            &u_trace_submission_data->cmd_trace_data[i]
+                .timestamp_copy_cs->entries[0];
+         cmds[entry_idx++] = (struct kgsl_command_object) {
+            .offset = trace_cs_entry->offset,
+            .gpuaddr = trace_cs_entry->bo->iova,
+            .size = trace_cs_entry->size,
+            .flags = KGSL_CMDLIST_IB,
+            .id = trace_cs_entry->bo->gem_handle,
+         };
+      }
+   }
+
+   struct kgsl_cmdbatch_profiling_buffer *profiling_buffer = NULL;
+   uint32_t obj_idx = 0;
+   if (u_trace_submission_data) {
+      struct tu_suballoc_bo *bo = &u_trace_submission_data->kgsl_timestamp_bo;
+      objs[obj_idx++] = (struct kgsl_command_object) {
+         .gpuaddr = bo->iova,
+         .offset = bo->iova - bo->bo->iova,
+         .size = sizeof(struct kgsl_cmdbatch_profiling_buffer),
+         .flags = KGSL_OBJLIST_MEMOBJ | KGSL_OBJLIST_PROFILE,
+         .id = bo->bo->gem_handle,
+      };
+
+      profiling_buffer =
+         (struct kgsl_cmdbatch_profiling_buffer *) tu_suballoc_bo_map(bo);
    }
 
    if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
@@ -1112,9 +1179,49 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
       .context_id = queue->msm_queue_id,
    };
 
+   if (obj_idx) {
+      req.flags |= KGSL_CMDBATCH_PROFILING;
+      req.objlist = (uintptr_t) objs;
+      req.objsize = sizeof(struct kgsl_command_object);
+      req.numobjs = obj_idx;
+   }
+
    int ret = safe_ioctl(queue->device->physical_device->local_fd,
                         IOCTL_KGSL_GPU_COMMAND, &req);
 
+   uint64_t gpu_offset = 0;
+#if HAVE_PERFETTO
+   if (profiling_buffer && profiling_buffer->gpu_ticks_queued) {
+      struct kgsl_perfcounter_read_group perf = {
+         .groupid = KGSL_PERFCOUNTER_GROUP_ALWAYSON,
+         .countable = 0,
+         .value = 0
+      };
+
+      struct kgsl_perfcounter_read req = {
+         .reads = &perf,
+         .count = 1,
+      };
+
+      ret = safe_ioctl(queue->device->fd, IOCTL_KGSL_PERFCOUNTER_READ, &req);
+      /* Older KGSL has some kind of garbage in the upper 32 bits */
+      uint64_t offseted_gpu_ts = perf.value & 0xffffffff;
+
+      gpu_offset = tu_device_ticks_to_ns(
+         queue->device, offseted_gpu_ts - profiling_buffer->gpu_ticks_queued);
+
+      struct tu_perfetto_clocks clocks = {
+         .cpu = profiling_buffer->wall_clock_ns,
+         .gpu_ts = tu_device_ticks_to_ns(queue->device,
+                                         profiling_buffer->gpu_ticks_queued),
+         .gpu_ts_offset = gpu_offset,
+      };
+
+      clocks = tu_perfetto_submit(queue->device,
+                                  queue->device->submit_count, &clocks);
+      gpu_offset = clocks.gpu_ts_offset;
+   }
+#endif
+
    kgsl_syncobj_destroy(&wait_sync);
 
    if (ret) {
@@ -1136,9 +1243,40 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
       signal_sync->timestamp = req.timestamp;
    }
 
+   if (u_trace_submission_data) {
+      struct tu_u_trace_submission_data *submission_data =
+         u_trace_submission_data;
+      submission_data->submission_id = queue->device->submit_count;
+      submission_data->gpu_ts_offset = gpu_offset;
+      /* We have to allocate it here since it is different between drm/kgsl */
+      submission_data->syncobj = (struct tu_u_trace_syncobj *)
+         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
+                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      submission_data->syncobj->timestamp = req.timestamp;
+      submission_data->syncobj->msm_queue_id = queue->msm_queue_id;
+
+      u_trace_submission_data = NULL;
+
+      for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
+         bool free_data = i == submission_data->last_buffer_with_tracepoints;
+         if (submission_data->cmd_trace_data[i].trace)
+            u_trace_flush(submission_data->cmd_trace_data[i].trace,
+                          submission_data, free_data);
+
+         if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
+            /* u_trace is owned by cmd_buffer */
+            submission_data->cmd_trace_data[i].trace = NULL;
+         }
+      }
+   }
+
+   queue->device->submit_count++;
+
    pthread_mutex_unlock(&queue->device->submit_mutex);
    pthread_cond_broadcast(&queue->device->timeline_cond);
 
+   u_trace_context_process(&queue->device->trace_context, true);
+
    if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers)
       vk_free(&queue->device->vk.alloc, cmd_buffers);
@@ -1149,6 +1287,13 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit)
 fail_submit:
    pthread_mutex_unlock(&queue->device->submit_mutex);
 
+   if (result != VK_SUCCESS) {
+      mtx_lock(&queue->device->kgsl_profiling_mutex);
+      tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc,
+                          &u_trace_submission_data->kgsl_timestamp_bo);
+      mtx_unlock(&queue->device->kgsl_profiling_mutex);
+   }
+
    if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers)
       vk_free(&queue->device->vk.alloc, cmd_buffers);
@@ -1160,7 +1305,19 @@ fail_submit:
 static VkResult
 kgsl_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
 {
-   tu_finishme("tu_device_wait_u_trace");
+   struct kgsl_device_waittimestamp_ctxtid req = {
+      .context_id = syncobj->msm_queue_id,
+      .timestamp = syncobj->timestamp,
+      .timeout = 5000, // 5s
+   };
+
+   int ret = safe_ioctl(dev->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &req);
+
+   if (ret) {
+      assert(errno == ETIME);
+      return VK_TIMEOUT;
+   }
+
    return VK_SUCCESS;
 }
@@ -1180,7 +1337,7 @@ kgsl_device_finish(struct tu_device *dev)
 static int
 kgsl_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
 {
-   tu_finishme("tu_device_get_gpu_timestamp");
+   unreachable("");
    return 0;
 }


@@ -311,7 +311,12 @@ tu_perfetto_init(void)
    util_perfetto_init();
 
    perfetto::DataSourceDescriptor dsd;
+#ifdef ANDROID
+   /* AGI requires this name */
+   dsd.set_name("gpu.renderstages");
+#else
    dsd.set_name("gpu.renderstages.msm");
+#endif
    TuRenderpassDataSource::Register(dsd);
 }