diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index ec40f455996..8d18773689d 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1888,6 +1888,13 @@ tu_u_trace_submission_data_finish( } } + if (submission_data->kgsl_timestamp_bo.bo) { + mtx_lock(&device->kgsl_profiling_mutex); + tu_suballoc_bo_free(&device->kgsl_profiling_suballoc, + &submission_data->kgsl_timestamp_bo); + mtx_unlock(&device->kgsl_profiling_mutex); + } + vk_free(&device->vk.alloc, submission_data->cmd_trace_data); vk_free(&device->vk.alloc, submission_data->syncobj); vk_free(&device->vk.alloc, submission_data); @@ -2086,6 +2093,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->bo_mutex, mtx_plain); mtx_init(&device->pipeline_mutex, mtx_plain); mtx_init(&device->autotune_mutex, mtx_plain); + mtx_init(&device->kgsl_profiling_mutex, mtx_plain); u_rwlock_init(&device->dma_bo_lock); pthread_mutex_init(&device->submit_mutex, NULL); @@ -2181,6 +2189,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, (enum tu_bo_alloc_flags) (TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP)); tu_bo_suballocator_init(&device->autotune_suballoc, device, 128 * 1024, TU_BO_ALLOC_NO_FLAGS); + if (is_kgsl(physical_device->instance)) { + tu_bo_suballocator_init(&device->kgsl_profiling_suballoc, device, + 128 * 1024, TU_BO_ALLOC_NO_FLAGS); + } result = tu_bo_init_new(device, &device->global_bo, global_size, TU_BO_ALLOC_ALLOW_DUMP, "global"); @@ -2428,6 +2440,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) tu_bo_suballocator_finish(&device->pipeline_suballoc); tu_bo_suballocator_finish(&device->autotune_suballoc); + tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 67db9b465bb..4f144168566 
100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -276,6 +276,12 @@ struct tu_device struct tu_suballocator autotune_suballoc; mtx_t autotune_mutex; + /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on + * each submission. + */ + struct tu_suballocator kgsl_profiling_suballoc; + mtx_t kgsl_profiling_mutex; + /* the blob seems to always use 8K factor and 128K param sizes, copy them */ #define TU_TESS_FACTOR_SIZE (8 * 1024) #define TU_TESS_PARAM_SIZE (128 * 1024) @@ -526,6 +532,9 @@ struct tu_u_trace_submission_data * offset may change between submissions due to power cycle. */ uint64_t gpu_ts_offset; + + /* KGSL needs a GPU memory to write submission timestamps into */ + struct tu_suballoc_bo kgsl_timestamp_bo; }; VkResult diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc index 0f033d9b290..6d3825a4da0 100644 --- a/src/freedreno/vulkan/tu_knl_kgsl.cc +++ b/src/freedreno/vulkan/tu_knl_kgsl.cc @@ -332,6 +332,12 @@ struct kgsl_syncobj int fd; }; +struct tu_u_trace_syncobj +{ + uint32_t msm_queue_id; + uint32_t timestamp; +}; + static void kgsl_syncobj_init(struct kgsl_syncobj *s, bool signaled) { @@ -930,6 +936,9 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) { MESA_TRACE_FUNC(); + bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); + bool has_trace_points = false; + if (vk_submit->command_buffer_count == 0) { pthread_mutex_lock(&queue->device->submit_mutex); @@ -1006,6 +1015,14 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) entry_count++; entry_count += cmd_buffer->cs.entry_count; + + if (u_trace_enabled && u_trace_has_points(&cmd_buffers[i]->trace)) { + if (!(cmd_buffers[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + entry_count++; + + has_trace_points = true; + } } if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) @@ -1019,6 +1036,26 @@ 
kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); } + uint32_t obj_count = 0; + if (has_trace_points) + obj_count++; + + struct kgsl_command_object *objs = (struct kgsl_command_object *) + vk_alloc(&queue->device->vk.alloc, sizeof(*objs) * obj_count, + alignof(*objs), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + struct tu_u_trace_submission_data *u_trace_submission_data = NULL; + if (has_trace_points) { + tu_u_trace_submission_data_create( + queue->device, cmd_buffers, cmdbuf_count, &u_trace_submission_data); + + mtx_lock(&queue->device->kgsl_profiling_mutex); + tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo, + &queue->device->kgsl_profiling_suballoc, + sizeof(struct kgsl_cmdbatch_profiling_buffer), 4); + mtx_unlock(&queue->device->kgsl_profiling_mutex); + } + uint32_t entry_idx = 0; for (uint32_t i = 0; i < cmdbuf_count; i++) { struct tu_cmd_buffer *cmd_buffer = cmd_buffers[i]; @@ -1044,6 +1081,36 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) .id = cs->entries[j].bo->gem_handle, }; } + + if (u_trace_submission_data && + u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) { + struct tu_cs_entry *trace_cs_entry = + &u_trace_submission_data->cmd_trace_data[i] + .timestamp_copy_cs->entries[0]; + cmds[entry_idx++] = (struct kgsl_command_object) { + .offset = trace_cs_entry->offset, + .gpuaddr = trace_cs_entry->bo->iova, + .size = trace_cs_entry->size, + .flags = KGSL_CMDLIST_IB, + .id = trace_cs_entry->bo->gem_handle, + }; + } + } + + struct kgsl_cmdbatch_profiling_buffer *profiling_buffer = NULL; + uint32_t obj_idx = 0; + if (u_trace_submission_data) { + struct tu_suballoc_bo *bo = &u_trace_submission_data->kgsl_timestamp_bo; + + objs[obj_idx++] = (struct kgsl_command_object) { + .gpuaddr = bo->iova, + .offset = bo->iova - bo->bo->iova, + .size = sizeof(struct kgsl_cmdbatch_profiling_buffer), + .flags = KGSL_OBJLIST_MEMOBJ | 
KGSL_OBJLIST_PROFILE, + .id = bo->bo->gem_handle, + }; + profiling_buffer = + (struct kgsl_cmdbatch_profiling_buffer *) tu_suballoc_bo_map(bo); } if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { @@ -1112,9 +1179,49 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) .context_id = queue->msm_queue_id, }; + if (obj_idx) { + req.flags |= KGSL_CMDBATCH_PROFILING; + req.objlist = (uintptr_t) objs; + req.objsize = sizeof(struct kgsl_command_object); + req.numobjs = obj_idx; + } + int ret = safe_ioctl(queue->device->physical_device->local_fd, IOCTL_KGSL_GPU_COMMAND, &req); + uint64_t gpu_offset = 0; +#if HAVE_PERFETTO + if (profiling_buffer && profiling_buffer->gpu_ticks_queued) { + struct kgsl_perfcounter_read_group perf = { + .groupid = KGSL_PERFCOUNTER_GROUP_ALWAYSON, + .countable = 0, + .value = 0 + }; + + struct kgsl_perfcounter_read req = { + .reads = &perf, + .count = 1, + }; + + (void) safe_ioctl(queue->device->fd, IOCTL_KGSL_PERFCOUNTER_READ, &req); + /* Older KGSL has some kind of garbage in upper 32 bits */ + uint64_t offseted_gpu_ts = perf.value & 0xffffffff; + + gpu_offset = tu_device_ticks_to_ns( + queue->device, offseted_gpu_ts - profiling_buffer->gpu_ticks_queued); + + struct tu_perfetto_clocks clocks = { + .cpu = profiling_buffer->wall_clock_ns, + .gpu_ts = tu_device_ticks_to_ns(queue->device, + profiling_buffer->gpu_ticks_queued), + .gpu_ts_offset = gpu_offset, + }; + + clocks = tu_perfetto_submit(queue->device, queue->device->submit_count, &clocks); + gpu_offset = clocks.gpu_ts_offset; + } +#endif + kgsl_syncobj_destroy(&wait_sync); if (ret) { @@ -1136,9 +1243,40 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) signal_sync->timestamp = req.timestamp; } + if (u_trace_submission_data) { + struct tu_u_trace_submission_data *submission_data = + u_trace_submission_data; + submission_data->submission_id = queue->device->submit_count; + submission_data->gpu_ts_offset = gpu_offset; + 
/* We have to allocate it here since it is different between drm/kgsl */ + submission_data->syncobj = (struct tu_u_trace_syncobj *) + vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + submission_data->syncobj->timestamp = req.timestamp; + submission_data->syncobj->msm_queue_id = queue->msm_queue_id; + + u_trace_submission_data = NULL; + + for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { + bool free_data = i == submission_data->last_buffer_with_tracepoints; + if (submission_data->cmd_trace_data[i].trace) + u_trace_flush(submission_data->cmd_trace_data[i].trace, + submission_data, free_data); + + if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { + /* u_trace is owned by cmd_buffer */ + submission_data->cmd_trace_data[i].trace = NULL; + } + } + } + + queue->device->submit_count++; + pthread_mutex_unlock(&queue->device->submit_mutex); pthread_cond_broadcast(&queue->device->timeline_cond); + u_trace_context_process(&queue->device->trace_context, true); + if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers) vk_free(&queue->device->vk.alloc, cmd_buffers); @@ -1149,6 +1287,13 @@ kgsl_queue_submit(struct tu_queue *queue, struct vk_queue_submit *vk_submit) fail_submit: pthread_mutex_unlock(&queue->device->submit_mutex); + if (result != VK_SUCCESS && u_trace_submission_data) { + mtx_lock(&queue->device->kgsl_profiling_mutex); + tu_suballoc_bo_free(&queue->device->kgsl_profiling_suballoc, + &u_trace_submission_data->kgsl_timestamp_bo); + mtx_unlock(&queue->device->kgsl_profiling_mutex); + } + if (cmd_buffers != (struct tu_cmd_buffer **) vk_submit->command_buffers) vk_free(&queue->device->vk.alloc, cmd_buffers); @@ -1160,7 +1305,19 @@ fail_submit: static VkResult kgsl_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj) { - tu_finishme("tu_device_wait_u_trace"); + struct kgsl_device_waittimestamp_ctxtid req = { + .context_id = syncobj->msm_queue_id, + .timestamp = 
syncobj->timestamp, + .timeout = 5000, // 5s + }; + + int ret = safe_ioctl(dev->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &req); + + if (ret) { + assert(errno == ETIME); + return VK_TIMEOUT; + } + return VK_SUCCESS; } @@ -1180,7 +1337,7 @@ kgsl_device_finish(struct tu_device *dev) static int kgsl_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts) { - tu_finishme("tu_device_get_gpu_timestamp"); + unreachable(""); return 0; } diff --git a/src/freedreno/vulkan/tu_perfetto.cc b/src/freedreno/vulkan/tu_perfetto.cc index b417eec081a..852b3086c65 100644 --- a/src/freedreno/vulkan/tu_perfetto.cc +++ b/src/freedreno/vulkan/tu_perfetto.cc @@ -311,7 +311,12 @@ tu_perfetto_init(void) util_perfetto_init(); perfetto::DataSourceDescriptor dsd; +#ifdef ANDROID + /* AGI requires this name */ + dsd.set_name("gpu.renderstages"); +#else dsd.set_name("gpu.renderstages.msm"); +#endif TuRenderpassDataSource::Register(dsd); }