hk: merge adjacent CDM control streams

Merging back-to-back CDM (compute) control streams reduces submission overhead in the kernel/firmware.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35662>
This commit is contained in:
Alyssa Rosenzweig 2025-01-31 12:21:08 -05:00 committed by Marge Bot
parent f7db4afd09
commit 43a47266c8
5 changed files with 73 additions and 5 deletions

View file

@ -75,6 +75,7 @@ static const struct debug_named_value agx_debug_options[] = {
{"scratch", AGX_DBG_SCRATCH, "Debug scratch memory usage"},
{"1queue", AGX_DBG_1QUEUE, "Force usage of a single queue for multiple contexts"},
{"nosoft", AGX_DBG_NOSOFT, "Disable soft fault optimizations"},
{"nomerge", AGX_DBG_NOMERGE, "Disable control stream merging"},
{"bodumpverbose", AGX_DBG_BODUMPVERBOSE, "Include extra info with dumps"},
DEBUG_NAMED_VALUE_END
};

View file

@ -47,6 +47,7 @@ enum agx_dbg {
AGX_DBG_NOSOFT = BITFIELD_BIT(19),
AGX_DBG_FEEDBACK = BITFIELD_BIT(20),
AGX_DBG_1QUEUE = BITFIELD_BIT(21),
AGX_DBG_NOMERGE = BITFIELD_BIT(22),
};
/* How many power-of-two levels in the BO cache do we want? 2^14 minimum chosen

View file

@ -259,6 +259,32 @@ hk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
return VK_SUCCESS;
}
/*
 * Coalesce runs of back-to-back compute (CDM) control streams in a command
 * buffer.
 *
 * Apart from reading timestamps, nothing is gained by submitting two CDM
 * streams one after the other from the same command buffer. Such sequences are
 * nevertheless hard to avoid while recording, because of the gymnastics needed
 * to reorder compute around graphics. Rather than preventing them up front, we
 * fix them up at EndCommandBuffer: a single pass over the control stream list,
 * so the cost is O(# of control streams).
 *
 * A stream is folded into its predecessor only if both are CDM streams and the
 * predecessor does not carry an end timestamp handle. Folded streams are
 * unlinked from the list and destroyed.
 */
static void
merge_control_streams(struct hk_cmd_buffer *cmd)
{
   /* Most recent stream that survived, i.e. the candidate merge target. */
   struct hk_cs *prev = NULL;

   /* The _safe iterator is required since we unlink entries as we walk. */
   list_for_each_entry_safe(struct hk_cs, cs, &cmd->control_streams, node) {
      /* Not mergeable: no predecessor, either side is not compute, or the
       * predecessor has an end timestamp. This stream becomes the new
       * merge target.
       */
      if (prev == NULL || prev->type != HK_CS_CDM || cs->type != HK_CS_CDM ||
          prev->timestamp.end.handle) {
         prev = cs;
         continue;
      }

      /* Append cs to prev, then drop cs. prev stays the target so that an
       * arbitrarily long run of CDM streams collapses into one.
       */
      hk_cs_merge_cdm(prev, cs);
      list_del(&cs->node);
      hk_cs_destroy(cs);
   }
}
VKAPI_ATTR VkResult VKAPI_CALL
hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
@ -271,6 +297,21 @@ hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
hk_cmd_buffer_end_compute(cmd);
hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
struct hk_device *dev = hk_cmd_buffer_device(cmd);
if (likely(!(dev->dev.debug & AGX_DBG_NOMERGE))) {
merge_control_streams(cmd);
}
/* We cannot terminate CDM control streams until after merging, since merging
* needs to append stream links late. Now that we've merged, insert all the
* missing stream terminates.
*/
list_for_each_entry(struct hk_cs, cs, &cmd->control_streams, node) {
if (cs->type == HK_CS_CDM) {
cs->current = agx_cdm_terminate(cs->current);
}
}
return vk_command_buffer_get_record_result(&cmd->vk);
}

View file

@ -370,7 +370,7 @@ struct hk_cs {
/* Statistics */
struct {
uint32_t calls, cmds, flushes;
uint32_t calls, cmds, flushes, merged;
} stats;
/* Timestamp writes. Currently just compute end / fragment end. We could
@ -406,6 +406,32 @@ struct hk_cs {
uint32_t restart_index;
};
/*
 * Concatenate compute control stream `b` onto the end of compute control
 * stream `a`. `a` is updated in place; `b` is only read.
 *
 * Preconditions (asserted): both streams are CDM streams, both belong to the
 * same command buffer, and `a` has no end timestamp handle.
 *
 * This helper copies/combines hk_cs fields by hand, so it must be kept in sync
 * with the definition of struct hk_cs.
 */
static inline void
hk_cs_merge_cdm(struct hk_cs *a, const struct hk_cs *b)
{
   assert(a->type == HK_CS_CDM && b->type == HK_CS_CDM);
   assert(a->cmd == b->cmd);
   assert(!a->timestamp.end.handle);

   /* Accumulate statistics, and count this merge. */
   a->stats.merged++;
   a->stats.flushes += b->stats.flushes;
   a->stats.cmds += b->stats.cmds;
   a->stats.calls += b->stats.calls;

   /* The combined stream needs scratch if either half did. */
   a->scratch.cs.preamble |= b->scratch.cs.preamble;
   a->scratch.cs.main |= b->scratch.cs.main;

   /* a has no end timestamp (asserted above), so take b's timestamp state. */
   a->timestamp = b->timestamp;

   /* Link the streams: the jump is emitted at a's current position with b's
    * address as the target. This must happen before a->current is
    * overwritten. Afterwards, recording for a continues where b left off.
    */
   agx_cdm_jump(a->current, b->addr);
   a->current = b->current;
   a->stream_linked = true;
}
static inline uint64_t
hk_cs_current_addr(struct hk_cs *cs)
{
@ -660,8 +686,6 @@ hk_cmd_buffer_end_compute_internal(struct hk_cmd_buffer *cmd,
if (cs->imm_writes.size) {
hk_dispatch_imm_writes(cmd, cs);
}
cs->current = agx_cdm_terminate(cs->current);
}
*ptr = NULL;

View file

@ -838,8 +838,9 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
if (cs->type == HK_CS_CDM) {
perf_debug(
cmdbuf,
"%u: Submitting CDM with %u API calls, %u dispatches, %u flushes",
i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes);
"%u: Submitting CDM with %u API calls, %u dispatches, %u flushes, %u merged",
i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes,
cs->stats.merged);
assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 ||
cs->timestamp.end.handle);