diff --git a/src/asahi/lib/agx_device.c b/src/asahi/lib/agx_device.c index 0e1f356680c..3e75efe519a 100644 --- a/src/asahi/lib/agx_device.c +++ b/src/asahi/lib/agx_device.c @@ -75,6 +75,7 @@ static const struct debug_named_value agx_debug_options[] = { {"scratch", AGX_DBG_SCRATCH, "Debug scratch memory usage"}, {"1queue", AGX_DBG_1QUEUE, "Force usage of a single queue for multiple contexts"}, {"nosoft", AGX_DBG_NOSOFT, "Disable soft fault optimizations"}, + {"nomerge", AGX_DBG_NOMERGE, "Disable control stream merging"}, {"bodumpverbose", AGX_DBG_BODUMPVERBOSE, "Include extra info with dumps"}, DEBUG_NAMED_VALUE_END }; diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index 17c900b67aa..52620660862 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -47,6 +47,7 @@ enum agx_dbg { AGX_DBG_NOSOFT = BITFIELD_BIT(19), AGX_DBG_FEEDBACK = BITFIELD_BIT(20), AGX_DBG_1QUEUE = BITFIELD_BIT(21), + AGX_DBG_NOMERGE = BITFIELD_BIT(22), }; /* How many power-of-two levels in the BO cache do we want? 2^14 minimum chosen diff --git a/src/asahi/vulkan/hk_cmd_buffer.c b/src/asahi/vulkan/hk_cmd_buffer.c index e355502784a..92fab5452f3 100644 --- a/src/asahi/vulkan/hk_cmd_buffer.c +++ b/src/asahi/vulkan/hk_cmd_buffer.c @@ -259,6 +259,32 @@ hk_BeginCommandBuffer(VkCommandBuffer commandBuffer, return VK_SUCCESS; } +/* + * Merge adjacent compute control streams. Except for reading timestamps, there + * is no reason to submit two CDM streams back-to-back in the same command + * buffer. However, it is challenging to avoid constructing such sequences due + * to the gymnastics required to reorder compute around graphics. Merging at + * EndCommandBuffer is cheap O(# of control streams) and lets us get away with + * the sloppiness. 
+ */ +static void +merge_control_streams(struct hk_cmd_buffer *cmd) +{ /* Walks cmd->control_streams once, folding each CDM stream into the CDM stream kept immediately before it. */ + struct hk_cs *last = NULL; /* most recent stream left in the list; NULL until the first entry is kept */ + + list_for_each_entry_safe(struct hk_cs, cs, &cmd->control_streams, node) { /* _safe iterator: cs may be unlinked and destroyed inside the body */ + if (cs->type == HK_CS_CDM && last && last->type == HK_CS_CDM && + !last->timestamp.end.handle) { /* a set end-timestamp handle on last blocks the merge; hk_cs_merge_cdm asserts it is clear */ + + hk_cs_merge_cdm(last, cs); + list_del(&cs->node); + hk_cs_destroy(cs); /* last is left unchanged here, so following CDM streams keep folding into it */ + } else { + last = cs; /* not merged: cs stays in the list and becomes the next merge candidate */ + } + } +} + VKAPI_ATTR VkResult VKAPI_CALL hk_EndCommandBuffer(VkCommandBuffer commandBuffer) { @@ -271,6 +297,21 @@ hk_EndCommandBuffer(VkCommandBuffer commandBuffer) hk_cmd_buffer_end_compute(cmd); hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + if (likely(!(dev->dev.debug & AGX_DBG_NOMERGE))) { + merge_control_streams(cmd); + } + + /* We cannot terminate CDM control streams until after merging, since merging + * needs to append stream links late. Now that we've merged, insert all the + * missing stream terminates. + */ + list_for_each_entry(struct hk_cs, cs, &cmd->control_streams, node) { + if (cs->type == HK_CS_CDM) { + cs->current = agx_cdm_terminate(cs->current); + } + } + return vk_command_buffer_get_record_result(&cmd->vk); } diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h index ef3bcc7fcd8..a1050819f8c 100644 --- a/src/asahi/vulkan/hk_cmd_buffer.h +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -370,7 +370,7 @@ struct hk_cs { /* Statistics */ struct { - uint32_t calls, cmds, flushes; + uint32_t calls, cmds, flushes, merged; } stats; /* Timestamp writes. Currently just compute end / fragment end. We could @@ -406,6 +406,32 @@ struct hk_cs { uint32_t restart_index; }; +/* + * Helper to merge two compute control streams, concatenating the second control + * stream to the first one. Must sync with hk_cs. 
+ */ +static inline void +hk_cs_merge_cdm(struct hk_cs *a, const struct hk_cs *b) +{ /* a is the destination and is modified in place; b is only read. */ + assert(a->type == HK_CS_CDM && b->type == HK_CS_CDM); + assert(a->cmd == b->cmd); + assert(!a->timestamp.end.handle); /* a->timestamp is overwritten below, so a must not already carry an end-timestamp handle */ + + agx_cdm_jump(a->current, b->addr); /* presumably writes a stream link at a's cursor that targets b's start address; TODO confirm against agx_cdm_jump */ + a->current = b->current; /* a's cursor now continues from where b stopped */ + a->stream_linked = true; /* NOTE(review): flag meaning is defined in struct hk_cs, not visible here */ + + a->scratch.cs.main |= b->scratch.cs.main; /* OR b's scratch state into a so the merged stream covers both */ + a->scratch.cs.preamble |= b->scratch.cs.preamble; + + a->timestamp = b->timestamp; /* the merged stream takes over b's timestamp state wholesale */ + + a->stats.calls += b->stats.calls; /* counters accumulate across the merge */ + a->stats.cmds += b->stats.cmds; + a->stats.flushes += b->stats.flushes; + a->stats.merged++; /* one more stream folded into a */ +} + static inline uint64_t hk_cs_current_addr(struct hk_cs *cs) { @@ -660,8 +686,6 @@ hk_cmd_buffer_end_compute_internal(struct hk_cmd_buffer *cmd, if (cs->imm_writes.size) { hk_dispatch_imm_writes(cmd, cs); } - - cs->current = agx_cdm_terminate(cs->current); } *ptr = NULL; diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c index ab0a89b2c63..e1129294abe 100644 --- a/src/asahi/vulkan/hk_queue.c +++ b/src/asahi/vulkan/hk_queue.c @@ -838,8 +838,9 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue, if (cs->type == HK_CS_CDM) { perf_debug( cmdbuf, - "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes", - i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes); + "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes, %u merged", + i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes, + cs->stats.merged); assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 || cs->timestamp.end.handle);