panvk/csf: add u_trace to panvk_cmd_buffer

There is one u_trace per subqueue to record trace events.  When tracing
is enabled, trace_begin_cmdbuf and trace_end_cmdbuf will emit trace
events to u_trace and emit timestamp writes to the command streams.

The trace events are buffered in u_trace and are not flushed for
processing yet.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32360>
This commit is contained in:
Chia-I Wu 2024-11-24 18:50:25 -08:00 committed by Marge Bot
parent 06cc6e82cf
commit 39824d70b8
4 changed files with 131 additions and 1 deletions

View file

@ -23,6 +23,7 @@
#include "vk_command_buffer.h"
#include "util/list.h"
#include "util/perf/u_trace.h"
#define MAX_VBS 16
#define MAX_RTS 8
@ -377,6 +378,10 @@ struct panvk_cmd_buffer {
uint32_t flush_id;
struct {
struct u_trace uts[PANVK_SUBQUEUE_COUNT];
} utrace;
struct {
struct panvk_cmd_graphics_state gfx;
struct panvk_cmd_compute_state compute;

View file

@ -39,6 +39,8 @@
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "panvk_tracepoints.h"
#include "panvk_utrace.h"
#include "pan_desc.h"
#include "pan_encoder.h"
@ -178,6 +180,8 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
}
}
trace_end_cmdbuf(&cmdbuf->utrace.uts[subqueue], cmdbuf, cmdbuf->flags);
cs_finish(&cmdbuf->state.cs[subqueue].builder);
}
@ -724,6 +728,7 @@ panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
struct panvk_cmd_pool *pool =
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
vk_command_buffer_reset(&cmdbuf->vk);
@ -733,6 +738,12 @@ panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
list_inithead(&cmdbuf->push_sets);
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) {
struct u_trace *ut = &cmdbuf->utrace.uts[i];
u_trace_fini(ut);
u_trace_init(ut, &dev->utrace.utctx);
}
memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
init_cs_builders(cmdbuf);
}
@ -746,6 +757,9 @@ panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
u_trace_fini(&cmdbuf->utrace.uts[i]);
panvk_pool_cleanup(&cmdbuf->cs_pool);
panvk_pool_cleanup(&cmdbuf->desc_pool);
panvk_pool_cleanup(&cmdbuf->tls_pool);
@ -814,6 +828,9 @@ panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
&tls_pool_props);
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
u_trace_init(&cmdbuf->utrace.uts[i], &device->utrace.utctx);
init_cs_builders(cmdbuf);
*cmdbuf_out = &cmdbuf->vk;
return VK_SUCCESS;
@ -843,6 +860,9 @@ panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
panvk_per_arch(cmd_inherit_render_state)(cmdbuf, pBeginInfo);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
trace_begin_cmdbuf(&cmdbuf->utrace.uts[i], cmdbuf);
return VK_SUCCESS;
}
@ -901,6 +921,12 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
cs_call(prim_b, addr, size);
struct u_trace *prim_ut = &primary->utrace.uts[j];
struct u_trace *sec_ut = &secondary->utrace.uts[j];
u_trace_clone_append(u_trace_begin_iterator(sec_ut),
u_trace_end_iterator(sec_ut), prim_ut, prim_b,
panvk_per_arch(utrace_copy_buffer));
}
}

View file

@ -5,14 +5,93 @@
#include "panvk_utrace.h"
#include "genxml/cs_builder.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
#include "panvk_priv_bo.h"
static void
cmd_write_timestamp(struct cs_builder *b, mali_ptr addr)
{
/* Scratch register pair {0,1} holds the 64-bit destination address. */
const struct cs_index addr_reg = cs_scratch_reg64(b, 0);
/* abuse DEFERRED_SYNC: defer the store until all iterator scoreboard
 * slots and DEFERRED_FLUSH have drained, and signal DEFERRED_SYNC on
 * completion so cmd_copy_data() can wait for the write to land.
 * NOTE(review): DEFERRED_SYNC is presumably otherwise unused at this
 * point in the stream — confirm against the subqueue's sync scheme.
 */
const struct cs_async_op async = cs_defer(
SB_ALL_ITERS_MASK | SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC));
cs_move64_to(b, addr_reg, addr);
/* Emit an asynchronous timestamp write (STORE_STATE TIMESTAMP) to addr. */
cs_store_state(b, addr_reg, 0, MALI_CS_STATE_TIMESTAMP, async);
}
/* Copy size bytes from src_addr to dst_addr with CS load/store
 * instructions, using the builder's scratch registers as a bounce buffer.
 * All three values must be 32-bit aligned.
 */
static void
cmd_copy_data(struct cs_builder *b, mali_ptr dst_addr, mali_ptr src_addr,
uint32_t size)
{
assert((dst_addr | src_addr | size) % sizeof(uint32_t) == 0);
/* wait for timestamp writes */
cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);
/* Depending on where this is called from, we could potentially use SR
 * registers or copy with a compute job.
 */
const struct cs_index dst_addr_reg = cs_scratch_reg64(b, 0);
const struct cs_index src_addr_reg = cs_scratch_reg64(b, 2);
/* Scratch regs 0-3 hold the two addresses; the rest carry copy data. */
const uint32_t temp_count = CS_REG_SCRATCH_COUNT - 4;
/* Outer loop: re-load the base addresses for each 64KB window, since the
 * load/store immediate offset appears limited to 16 bits — TODO confirm
 * against the CS instruction encoding.
 */
while (size) {
cs_move64_to(b, dst_addr_reg, dst_addr);
cs_move64_to(b, src_addr_reg, src_addr);
const uint32_t max_offset = 1 << 16;
uint32_t copy_count = MIN2(size, max_offset) / sizeof(uint32_t);
uint32_t offset = 0;
/* Inner loop: bounce up to temp_count 32-bit words at a time through
 * the scratch registers.
 */
while (copy_count) {
const uint32_t count = MIN2(copy_count, temp_count);
const struct cs_index reg = cs_scratch_reg_tuple(b, 4, count);
cs_load_to(b, reg, src_addr_reg, BITFIELD_MASK(count), offset);
/* The load must complete before the registers are stored back out. */
cs_wait_slot(b, SB_ID(LS), false);
cs_store(b, reg, dst_addr_reg, BITFIELD_MASK(count), offset);
copy_count -= count;
offset += count * sizeof(uint32_t);
}
dst_addr += offset;
src_addr += offset;
size -= offset;
}
/* Drain the final stores before anyone consumes the destination. */
cs_wait_slot(b, SB_ID(LS), false);
}
/* Map a per-subqueue u_trace back to the cs_builder of its subqueue.
 * Each cmdbuf owns one u_trace per subqueue in utrace.uts[], so the
 * subqueue index is simply ut's position within that array.
 */
static struct cs_builder *
get_builder(struct panvk_cmd_buffer *cmdbuf, struct u_trace *ut)
{
   const ptrdiff_t idx = ut - cmdbuf->utrace.uts;

   assert(idx >= 0 && idx < PANVK_SUBQUEUE_COUNT);

   return panvk_get_cs_builder(cmdbuf, idx);
}
/* u_trace record_ts callback: emit a GPU timestamp write for one trace
 * event.  cs is the panvk_cmd_buffer the trace belongs to (as passed by
 * the u_trace machinery), timestamps is the panvk_priv_bo backing the
 * timestamp buffer, and offset_B selects this event's slot within it.
 * flags is currently unused.
 */
static void
panvk_utrace_record_ts(struct u_trace *ut, void *cs, void *timestamps,
uint64_t offset_B, uint32_t flags)
{
struct cs_builder *b = get_builder(cs, ut);
const struct panvk_priv_bo *bo = timestamps;
const mali_ptr addr = bo->addr.dev + offset_B;
cmd_write_timestamp(b, addr);
}
/* Initialize the device-level u_trace context.
 *
 * Timestamps are 64-bit (sizeof(uint64_t) per entry).  The diff rendering
 * had left both the old and new argument lines in this call, which is not
 * valid C; only the updated argument list (with panvk_utrace_record_ts
 * wired in as the record_timestamp callback) is kept.
 */
void
panvk_per_arch(utrace_context_init)(struct panvk_device *dev)
{
   u_trace_context_init(&dev->utrace.utctx, NULL, sizeof(uint64_t), 0,
                        panvk_utrace_create_buffer, panvk_utrace_delete_buffer,
                        panvk_utrace_record_ts, panvk_utrace_read_ts, NULL,
                        NULL, NULL);
}
void
@ -20,3 +99,18 @@ panvk_per_arch(utrace_context_fini)(struct panvk_device *dev)
{
u_trace_context_fini(&dev->utrace.utctx);
}
/* u_trace copy_ts_buffer callback: copy size_B bytes of timestamps from
 * one timestamp buffer to another by emitting CS copy instructions into
 * cmdstream.  Used by CmdExecuteCommands to clone a secondary command
 * buffer's trace events into the primary (see u_trace_clone_append).
 * ts_from/ts_to are panvk_priv_bo pointers, matching what
 * panvk_utrace_create_buffer hands to u_trace.
 */
void
panvk_per_arch(utrace_copy_buffer)(struct u_trace_context *utctx,
void *cmdstream, void *ts_from,
uint64_t from_offset, void *ts_to,
uint64_t to_offset, uint64_t size_B)
{
struct cs_builder *b = cmdstream;
const struct panvk_priv_bo *src_bo = ts_from;
const struct panvk_priv_bo *dst_bo = ts_to;
const mali_ptr src_addr = src_bo->addr.dev + from_offset;
const mali_ptr dst_addr = dst_bo->addr.dev + to_offset;
cmd_copy_data(b, dst_addr, src_addr, size_B);
}

View file

@ -27,6 +27,11 @@ uint64_t panvk_utrace_read_ts(struct u_trace_context *utctx, void *timestamps,
void panvk_per_arch(utrace_context_init)(struct panvk_device *dev);
void panvk_per_arch(utrace_context_fini)(struct panvk_device *dev);
void panvk_per_arch(utrace_copy_buffer)(struct u_trace_context *utctx,
void *cmdstream, void *ts_from,
uint64_t from_offset, void *ts_to,
uint64_t to_offset, uint64_t size_B);
#else /* PAN_ARCH >= 10 */
static inline void