diff --git a/docs/features.txt b/docs/features.txt index c44569946a2..97af0a7a123 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -474,7 +474,7 @@ Vulkan 1.2 -- all DONE: anv, vn Vulkan 1.3 -- all DONE: anv, radv, lvp VK_KHR_copy_commands2 DONE (anv, lvp, radv, tu, v3dv) - VK_KHR_dynamic_rendering DONE (anv, lvp, radv) + VK_KHR_dynamic_rendering DONE (anv, lvp, radv, tu) VK_KHR_format_feature_flags2 DONE (anv, radv, tu, v3dv) VK_KHR_maintenance4 DONE (anv, radv, tu) VK_KHR_shader_non_semantic_info DONE (anv, radv, tu, v3dv) diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index 8fdf6d9c1a6..a3c1ab53ce0 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -39,6 +39,7 @@ libtu_files = files( 'tu_device.c', 'tu_descriptor_set.c', 'tu_descriptor_set.h', + 'tu_dynamic_rendering.c', 'tu_formats.c', 'tu_image.c', 'tu_lrz.c', diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 2efe79ee4eb..4843b37be8c 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1526,6 +1526,46 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer); } +void +tu_cmd_render(struct tu_cmd_buffer *cmd_buffer) +{ + if (cmd_buffer->state.rp.has_tess) + tu6_lazy_emit_tessfactor_addr(cmd_buffer); + + struct tu_renderpass_result *autotune_result = NULL; + if (use_sysmem_rendering(cmd_buffer, &autotune_result)) + tu_cmd_render_sysmem(cmd_buffer, autotune_result); + else + tu_cmd_render_tiles(cmd_buffer, autotune_result); + + /* Outside of renderpasses we assume all draw states are disabled. We do + * this outside the draw CS for the normal case where 3d gmem stores aren't + * used. 
+ */
+   tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
+}
+
+static void
+tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
+{
+   /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
+      rendered */
+   tu_cs_discard_entries(&cmd_buffer->draw_cs);
+   tu_cs_begin(&cmd_buffer->draw_cs);
+   tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
+   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
+
+   cmd_buffer->state.pass = NULL;
+   cmd_buffer->state.subpass = NULL;
+   cmd_buffer->state.framebuffer = NULL;
+   cmd_buffer->state.attachments = NULL;
+   memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
+
+   /* LRZ is not valid next time we use it */
+   cmd_buffer->state.lrz.valid = false;
+   cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
+}
+
 static VkResult
 tu_create_cmd_buffer(struct tu_device *device,
                      struct tu_cmd_pool *pool,
@@ -1570,6 +1610,8 @@ tu_create_cmd_buffer(struct tu_device *device,
    tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
    tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
    tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
+   tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096);
+   tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
 
    *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
 
@@ -1586,6 +1628,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_finish(&cmd_buffer->tile_store_cs);
    tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
    tu_cs_finish(&cmd_buffer->sub_cs);
+   tu_cs_finish(&cmd_buffer->pre_chain.draw_cs);
+   tu_cs_finish(&cmd_buffer->pre_chain.draw_epilogue_cs);
 
    u_trace_fini(&cmd_buffer->trace);
 
@@ -1614,6 +1658,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_reset(&cmd_buffer->tile_store_cs);
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
+   tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
+   tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
 
    tu_autotune_free_results(cmd_buffer->device,
                             &cmd_buffer->renderpass_autotune_results);
 
@@ -1728,13 +1774,15 @@ tu_cache_init(struct tu_cache_state *cache)
    cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
 }
 
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
-                      const VkCommandBufferBeginInfo *pBeginInfo)
+/* Unlike the public entrypoint, this doesn't handle cache tracking or CCU
+ * state tracking. It's used by the driver to insert its own command buffers
+ * in the middle of a submit.
+ */
+VkResult
+tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
+                    VkCommandBufferUsageFlags usage_flags)
 {
-   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
    VkResult result = VK_SUCCESS;
-
    if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
       /* If the command buffer has already been reset with
        * vkResetCommandBuffer, no need to do it again.
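For orientation, the tu_cmd_render()/tu_reset_render_pass() split above matters because a suspended render pass must outlive vkCmdEndRendering(): rendering and cleanup used to be one tu_end_rendering() function (removed further down in this patch), but with suspend/resume they can happen at different times, possibly in different command buffers. A rough sketch of the resulting end-of-pass control flow, simplified from this patch (the pre-chain save path is omitted; this is not the literal driver code):

   /* Simplified sketch: when suspending, draw_cs/draw_epilogue_cs are kept
    * alive instead of being rendered and discarded. */
   static void
   end_rendering_sketch(struct tu_cmd_buffer *cmd)
   {
      if (cmd->state.suspending)
         return; /* draw_cs is merged later, at resume or at submit time */

      tu_cs_end(&cmd->draw_cs);
      tu_cs_end(&cmd->draw_epilogue_cs);
      tu_cmd_render(cmd);        /* choose sysmem vs. GMEM and emit the pass */
      tu_reset_render_pass(cmd); /* discard draw_cs, clear state, invalidate LRZ */
   }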
@@ -1750,12 +1798,25 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, tu_cache_init(&cmd_buffer->state.cache); tu_cache_init(&cmd_buffer->state.renderpass_cache); - cmd_buffer->usage_flags = pBeginInfo->flags; + cmd_buffer->usage_flags = usage_flags; tu_cs_begin(&cmd_buffer->cs); tu_cs_begin(&cmd_buffer->draw_cs); tu_cs_begin(&cmd_buffer->draw_epilogue_cs); + cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + VkResult result = tu_cmd_buffer_begin(cmd_buffer, pBeginInfo->flags); + if (result != VK_SUCCESS) + return result; + /* setup initial configuration into command buffer */ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { switch (cmd_buffer->queue_family_index) { @@ -1805,8 +1866,6 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, } } - cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; - return VK_SUCCESS; } @@ -3331,7 +3390,7 @@ tu_flush_for_stage(struct tu_cache_state *cache, } } -static void +void tu_render_pass_state_merge(struct tu_render_pass_state *dst, const struct tu_render_pass_state *src) { @@ -3346,6 +3405,103 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst, src->drawcall_bandwidth_per_sample_sum; } +void +tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *suspended) +{ + cmd->state.pass = suspended->state.suspended_pass.pass; + cmd->state.subpass = suspended->state.suspended_pass.subpass; + cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer; + cmd->state.attachments = suspended->state.suspended_pass.attachments; + cmd->state.render_area = suspended->state.suspended_pass.render_area; + cmd->state.lrz = suspended->state.suspended_pass.lrz; +} + +/* Take the saved pre-chain in "secondary" and copy its commands to "cmd", + * appending it after any saved-up commands in "cmd". + */ +void +tu_append_pre_chain(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *secondary) +{ + tu_cs_add_entries(&cmd->draw_cs, &secondary->pre_chain.draw_cs); + tu_cs_add_entries(&cmd->draw_epilogue_cs, + &secondary->pre_chain.draw_epilogue_cs); + tu_render_pass_state_merge(&cmd->state.rp, + &secondary->pre_chain.state); + if (!u_trace_iterator_equal(secondary->pre_chain.trace_renderpass_start, + secondary->pre_chain.trace_renderpass_end)) { + tu_cs_emit_wfi(&cmd->draw_cs); + tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0); + u_trace_clone_append(secondary->pre_chain.trace_renderpass_start, + secondary->pre_chain.trace_renderpass_end, + &cmd->trace, &cmd->draw_cs, + tu_copy_timestamp_buffer); + } +} + +/* Take the saved post-chain in "secondary" and copy it to "cmd". + */ +void +tu_append_post_chain(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *secondary) +{ + tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs); + tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs); + if (!u_trace_iterator_equal(secondary->trace_renderpass_start, + secondary->trace_renderpass_end)) { + tu_cs_emit_wfi(&cmd->draw_cs); + tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0); + u_trace_clone_append(secondary->trace_renderpass_start, + secondary->trace_renderpass_end, + &cmd->trace, &cmd->draw_cs, + tu_copy_timestamp_buffer); + } + cmd->state.rp = secondary->state.rp; +} + +/* Assuming "secondary" is just a sequence of suspended and resuming passes, + * copy its state to "cmd". 
This also works instead of tu_append_post_chain(),
+ * but it's a bit slower because we don't assume that the chain begins in
+ * "secondary" and therefore have to care about the command buffer's
+ * renderpass state.
+ */
+void
+tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
+                         struct tu_cmd_buffer *secondary)
+{
+   tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
+   tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
+   if (!u_trace_iterator_equal(secondary->trace_renderpass_start,
+                               secondary->trace_renderpass_end)) {
+      tu_cs_emit_wfi(&cmd->draw_cs);
+      tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0);
+      u_trace_clone_append(secondary->trace_renderpass_start,
+                           secondary->trace_renderpass_end,
+                           &cmd->trace, &cmd->draw_cs,
+                           tu_copy_timestamp_buffer);
+   }
+   tu_render_pass_state_merge(&cmd->state.rp,
+                              &secondary->state.rp);
+}
+
+/* Take the current render pass state and save it to "pre_chain" to be
+ * combined later.
+ */
+static void
+tu_save_pre_chain(struct tu_cmd_buffer *cmd)
+{
+   tu_cs_add_entries(&cmd->pre_chain.draw_cs,
+                     &cmd->draw_cs);
+   tu_cs_add_entries(&cmd->pre_chain.draw_epilogue_cs,
+                     &cmd->draw_epilogue_cs);
+   cmd->pre_chain.trace_renderpass_start =
+      cmd->trace_renderpass_start;
+   cmd->pre_chain.trace_renderpass_end =
+      cmd->trace_renderpass_end;
+   cmd->pre_chain.state = cmd->state.rp;
+}
+
 VKAPI_ATTR void VKAPI_CALL
 tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                       uint32_t commandBufferCount,
@@ -3393,10 +3549,110 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
          tu_render_pass_state_merge(&cmd->state.rp,
                                     &secondary->state.rp);
       } else {
-         assert(tu_cs_is_empty(&secondary->draw_cs));
-         assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
+         switch (secondary->state.suspend_resume) {
+         case SR_NONE:
+            assert(tu_cs_is_empty(&secondary->draw_cs));
+            assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
+            tu_cs_add_entries(&cmd->cs, &secondary->cs);
+            break;
 
-         tu_cs_add_entries(&cmd->cs, &secondary->cs);
+         case SR_IN_PRE_CHAIN:
+            /* cmd may be empty, which means that the chain begins before cmd,
+             * in which case we have to update its state.
+             */
+            if (cmd->state.suspend_resume == SR_NONE) {
+               cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
+               cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
+            }
+
+            /* The secondary is just a continuous suspend/resume chain so we
+             * just have to append it to the command buffer.
+             */
+            assert(tu_cs_is_empty(&secondary->cs));
+            tu_append_pre_post_chain(cmd, secondary);
+            break;
+
+         case SR_AFTER_PRE_CHAIN:
+         case SR_IN_CHAIN:
+         case SR_IN_CHAIN_AFTER_PRE_CHAIN:
+            if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN ||
+                secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN) {
+               /* In these cases the secondary has a `pre_chain` that ends
+                * there, which we need to append to the primary.
+                */
+
+               if (cmd->state.suspend_resume == SR_NONE)
+                  cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
+
+               tu_append_pre_chain(cmd, secondary);
+               cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
+
+               /* We're about to render, so we need to end the command stream
+                * in case there were any extra commands generated by copying
+                * the trace.
+                */
+               tu_cs_end(&cmd->draw_cs);
+               tu_cs_end(&cmd->draw_epilogue_cs);
+
+               switch (cmd->state.suspend_resume) {
+               case SR_NONE:
+               case SR_IN_PRE_CHAIN:
+                  /* The renderpass chain ends in the secondary but isn't
+                   * started in the primary, so we have to move the state to
+                   * `pre_chain`.
+ */ + tu_save_pre_chain(cmd); + cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN; + break; + case SR_IN_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + /* The renderpass ends in the secondary and starts somewhere + * earlier in this primary. Since the last render pass in + * the chain is in the secondary, we are technically outside + * of a render pass. Fix that here by reusing the dynamic + * render pass that was setup for the last suspended render + * pass before the secondary. + */ + tu_restore_suspended_pass(cmd, cmd); + + tu_cmd_render(cmd); + if (cmd->state.suspend_resume == SR_IN_CHAIN) + cmd->state.suspend_resume = SR_NONE; + else + cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN; + break; + case SR_AFTER_PRE_CHAIN: + unreachable("resuming render pass is not preceded by suspending one"); + } + + tu_reset_render_pass(cmd); + } + + tu_cs_add_entries(&cmd->cs, &secondary->cs); + + if (secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN || + secondary->state.suspend_resume == SR_IN_CHAIN) { + /* The secondary ends in a "post-chain" (the opposite of a + * pre-chain) that we need to copy into the current command + * buffer. + */ + cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); + tu_append_post_chain(cmd, secondary); + cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); + cmd->state.suspended_pass = secondary->state.suspended_pass; + + switch (cmd->state.suspend_resume) { + case SR_NONE: + cmd->state.suspend_resume = SR_IN_CHAIN; + break; + case SR_AFTER_PRE_CHAIN: + cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN; + break; + default: + unreachable("suspending render pass is followed by a not resuming one"); + } + } + } } cmd->state.index_size = secondary->state.index_size; /* for restart index update */ @@ -3685,12 +3941,65 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.cache.pending_flush_bits; cmd->state.renderpass_cache.flush_bits = 0; - trace_start_render_pass(&cmd->trace, &cmd->cs); + bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT; + bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT; + cmd->state.suspending = suspending; + cmd->state.resuming = resuming; - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); + /* We can't track LRZ across command buffer boundaries, so we have to + * disable LRZ when resuming/suspending unless we can track on the GPU. 
+ */ + if ((resuming || suspending) && + !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) { + cmd->state.lrz.valid = false; + } else { + if (resuming) + tu_lrz_begin_resumed_renderpass(cmd, clear_values); + else + tu_lrz_begin_renderpass(cmd, clear_values); + } - tu_emit_renderpass_begin(cmd, clear_values); - tu_emit_subpass_begin(cmd); + + if (suspending) { + cmd->state.suspended_pass.pass = cmd->state.pass; + cmd->state.suspended_pass.subpass = cmd->state.subpass; + cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer; + cmd->state.suspended_pass.render_area = cmd->state.render_area; + cmd->state.suspended_pass.attachments = cmd->state.attachments; + } + + if (!resuming) { + trace_start_render_pass(&cmd->trace, &cmd->cs); + } + + if (!resuming || cmd->state.suspend_resume == SR_NONE) { + cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); + } + + if (!resuming) { + tu_emit_renderpass_begin(cmd, clear_values); + tu_emit_subpass_begin(cmd); + } + + if (suspending && !resuming) { + /* entering a chain */ + switch (cmd->state.suspend_resume) { + case SR_NONE: + cmd->state.suspend_resume = SR_IN_CHAIN; + break; + case SR_AFTER_PRE_CHAIN: + cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN; + break; + case SR_IN_PRE_CHAIN: + case SR_IN_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + unreachable("suspending render pass not followed by resuming pass"); + break; + } + } + + if (resuming && cmd->state.suspend_resume == SR_NONE) + cmd->state.suspend_resume = SR_IN_PRE_CHAIN; } VKAPI_ATTR void VKAPI_CALL @@ -4801,60 +5110,25 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, tu_dispatch(cmd_buffer, &info); } -static void -tu_end_rendering(struct tu_cmd_buffer *cmd_buffer) -{ - tu_cs_end(&cmd_buffer->draw_cs); - tu_cs_end(&cmd_buffer->draw_epilogue_cs); - - cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); - - if (cmd_buffer->state.rp.has_tess) - tu6_lazy_emit_tessfactor_addr(cmd_buffer); - - struct tu_renderpass_result *autotune_result = NULL; - if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); - else - tu_cmd_render_tiles(cmd_buffer, autotune_result); - - /* Outside of renderpasses we assume all draw states are disabled. We do - * this outside the draw CS for the normal case where 3d gmem stores aren't - * used. 
- */ - tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs); - - /* discard draw_cs and draw_epilogue_cs entries now that the tiles are - rendered */ - tu_cs_discard_entries(&cmd_buffer->draw_cs); - tu_cs_begin(&cmd_buffer->draw_cs); - tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs); - tu_cs_begin(&cmd_buffer->draw_epilogue_cs); - - cmd_buffer->state.pass = NULL; - cmd_buffer->state.subpass = NULL; - cmd_buffer->state.framebuffer = NULL; - cmd_buffer->state.attachments = NULL; - memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); - - /* LRZ is not valid next time we use it */ - cmd_buffer->state.lrz.valid = false; - cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ; -} - VKAPI_ATTR void VKAPI_CALL tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo) { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - tu_end_rendering(cmd_buffer); + cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); + + tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->draw_epilogue_cs); + tu_cmd_render(cmd_buffer); cmd_buffer->state.cache.pending_flush_bits |= cmd_buffer->state.renderpass_cache.pending_flush_bits; tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments); + + tu_reset_render_pass(cmd_buffer); } VKAPI_ATTR void VKAPI_CALL @@ -4862,7 +5136,38 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer) { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - tu_end_rendering(cmd_buffer); + cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); + + if (cmd_buffer->state.suspending) + cmd_buffer->state.suspended_pass.lrz = cmd_buffer->state.lrz; + + if (!cmd_buffer->state.suspending) { + tu_cs_end(&cmd_buffer->draw_cs); + tu_cs_end(&cmd_buffer->draw_epilogue_cs); + + if (cmd_buffer->state.suspend_resume == SR_IN_PRE_CHAIN) { + tu_save_pre_chain(cmd_buffer); + } else { + tu_cmd_render(cmd_buffer); + } + + tu_reset_render_pass(cmd_buffer); + } + + if (cmd_buffer->state.resuming && !cmd_buffer->state.suspending) { + /* exiting suspend/resume chain */ + switch (cmd_buffer->state.suspend_resume) { + case SR_IN_CHAIN: + cmd_buffer->state.suspend_resume = SR_NONE; + break; + case SR_IN_PRE_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + cmd_buffer->state.suspend_resume = SR_AFTER_PRE_CHAIN; + break; + default: + unreachable("suspending render pass not followed by resuming pass"); + } + } } static void diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 1986701c796..f5b7778f25a 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -180,6 +180,7 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_zero_initialize_workgroup_memory = true, .KHR_shader_non_semantic_info = true, .KHR_synchronization2 = true, + .KHR_dynamic_rendering = true, #ifndef TU_USE_KGSL .KHR_timeline_semaphore = true, #endif @@ -237,6 +238,7 @@ get_device_extensions(const struct tu_physical_device *device, .VALVE_mutable_descriptor_type = true, .EXT_image_2d_view_of_3d = true, .EXT_color_write_enable = true, + .EXT_load_store_op_none = true, }; } @@ -640,7 +642,7 @@ tu_get_physical_device_features_1_3(struct tu_physical_device *pdevice, features->synchronization2 = true; features->textureCompressionASTC_HDR = false; features->shaderZeroInitializeWorkgroupMemory = true; - features->dynamicRendering = false; + features->dynamicRendering = true; 
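For reference, the application-visible pattern that the suspend/resume plumbing in this patch exists to support looks like the following. This uses only standard Vulkan 1.3 / VK_KHR_dynamic_rendering API; attachment and render-area setup are elided, and cmd_a/cmd_b are placeholder command buffers:

   VkRenderingInfo info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .flags = VK_RENDERING_SUSPENDING_BIT,
      .renderArea = render_area,
      .layerCount = 1,
      .colorAttachmentCount = 1,
      .pColorAttachments = &color_att,
   };
   vkCmdBeginRendering(cmd_a, &info);
   /* ... draws recorded into cmd_a ... */
   vkCmdEndRendering(cmd_a);   /* suspended: nothing rendered yet */

   info.flags = VK_RENDERING_RESUMING_BIT; /* resume (and, here, finish) */
   vkCmdBeginRendering(cmd_b, &info);
   /* ... more draws ... */
   vkCmdEndRendering(cmd_b);   /* chain ends: the whole pass can be rendered */

   /* cmd_a and cmd_b must be submitted together and in order; the driver
    * merges them into a single GMEM rendering pass at submit time. */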
features->shaderIntegerDotProduct = true; features->maintenance4 = true; } @@ -1611,6 +1613,37 @@ tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t)); } +/* Special helpers instead of u_trace_begin_iterator()/u_trace_end_iterator() + * that ignore tracepoints at the beginning/end that are part of a + * suspend/resume chain. + */ +static struct u_trace_iterator +tu_cmd_begin_iterator(struct tu_cmd_buffer *cmdbuf) +{ + switch (cmdbuf->state.suspend_resume) { + case SR_IN_PRE_CHAIN: + return cmdbuf->trace_renderpass_end; + case SR_AFTER_PRE_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + return cmdbuf->pre_chain.trace_renderpass_end; + default: + return u_trace_begin_iterator(&cmdbuf->trace); + } +} + +static struct u_trace_iterator +tu_cmd_end_iterator(struct tu_cmd_buffer *cmdbuf) +{ + switch (cmdbuf->state.suspend_resume) { + case SR_IN_PRE_CHAIN: + return cmdbuf->trace_renderpass_end; + case SR_IN_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + return cmdbuf->trace_renderpass_start; + default: + return u_trace_end_iterator(&cmdbuf->trace); + } +} VkResult tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, struct u_trace **trace_copy) @@ -1638,8 +1671,8 @@ tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, } u_trace_init(*trace_copy, cmdbuf->trace.utctx); - u_trace_clone_append(u_trace_begin_iterator(&cmdbuf->trace), - u_trace_end_iterator(&cmdbuf->trace), + u_trace_clone_append(tu_cmd_begin_iterator(cmdbuf), + tu_cmd_end_iterator(cmdbuf), *trace_copy, *cs, tu_copy_timestamp_buffer); @@ -1900,6 +1933,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, /* initialize to ones so ffs can be used to find unused slots */ BITSET_ONES(device->custom_border_color); + result = tu_init_dynamic_rendering(device); + if (result != VK_SUCCESS) { + vk_startup_errorf(device->instance, result, "dynamic rendering"); + goto fail_dynamic_rendering; + } + struct vk_pipeline_cache_create_info pcc_info = { }; device->mem_cache = vk_pipeline_cache_create(&device->vk, &pcc_info, false); @@ -2009,6 +2048,8 @@ fail_perfcntrs_pass_entries_alloc: fail_perfcntrs_pass_alloc: vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc); fail_pipeline_cache: + tu_destroy_dynamic_rendering(device); +fail_dynamic_rendering: tu_destroy_clear_blit_shaders(device); fail_global_bo_map: tu_bo_finish(device, device->global_bo); @@ -2055,6 +2096,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) tu_destroy_clear_blit_shaders(device); + tu_destroy_dynamic_rendering(device); + ir3_compiler_destroy(device->compiler); vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc); diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index 5fad4ba689c..6f6688366a7 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -53,10 +53,12 @@ struct tu_queue_submit struct vk_queue_submit *vk_submit; struct tu_u_trace_submission_data *u_trace_submission_data; + struct tu_cmd_buffer **cmd_buffers; struct drm_msm_gem_submit_cmd *cmds; struct drm_msm_gem_submit_syncobj *in_syncobjs; struct drm_msm_gem_submit_syncobj *out_syncobjs; + uint32_t nr_cmd_buffers; uint32_t nr_in_syncobjs; uint32_t nr_out_syncobjs; uint32_t entry_count; @@ -833,11 +835,17 @@ tu_queue_submit_create_locked(struct tu_queue *queue, bool has_trace_points = false; struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers; - struct tu_cmd_buffer 
**cmd_buffers = (void *)vk_cmd_buffers; + + memset(new_submit, 0, sizeof(struct tu_queue_submit)); + + new_submit->cmd_buffers = (void *)vk_cmd_buffers; + new_submit->nr_cmd_buffers = vk_submit->command_buffer_count; + tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers, + &new_submit->nr_cmd_buffers); uint32_t entry_count = 0; - for (uint32_t j = 0; j < vk_submit->command_buffer_count; ++j) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[j]; + for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) { + struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j]; if (perf_pass_index != ~0) entry_count++; @@ -852,11 +860,8 @@ tu_queue_submit_create_locked(struct tu_queue *queue, } } - - memset(new_submit, 0, sizeof(struct tu_queue_submit)); - new_submit->autotune_fence = - tu_autotune_submit_requires_fence(cmd_buffers, vk_submit->command_buffer_count); + tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers); if (new_submit->autotune_fence) entry_count++; @@ -872,8 +877,8 @@ tu_queue_submit_create_locked(struct tu_queue *queue, if (has_trace_points) { result = tu_u_trace_submission_data_create( - queue->device, cmd_buffers, - vk_submit->command_buffer_count, + queue->device, new_submit->cmd_buffers, + new_submit->nr_cmd_buffers, &new_submit->u_trace_submission_data); if (result != VK_SUCCESS) { @@ -927,6 +932,8 @@ tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit) vk_free(&queue->device->vk.alloc, submit->cmds); vk_free(&queue->device->vk.alloc, submit->in_syncobjs); vk_free(&queue->device->vk.alloc, submit->out_syncobjs); + if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers) + vk_free(&queue->device->vk.alloc, submit->cmd_buffers); } static void @@ -951,13 +958,10 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, struct tu_device *dev = queue->device; struct drm_msm_gem_submit_cmd *cmds = submit->cmds; - struct vk_command_buffer **vk_cmd_buffers = submit->vk_submit->command_buffers; - struct tu_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers; - uint32_t entry_idx = 0; - for (uint32_t j = 0; j < submit->vk_submit->command_buffer_count; ++j) { + for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) { struct tu_device *dev = queue->device; - struct tu_cmd_buffer *cmdbuf = cmd_buffers[j]; + struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j]; struct tu_cs *cs = &cmdbuf->cs; if (submit->perf_pass_index != ~0) { @@ -996,11 +1000,10 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) struct tu_cs *autotune_cs = NULL; if (submit->autotune_fence) { - struct tu_cmd_buffer **cmd_buffers = (void *)submit->vk_submit->command_buffers; autotune_cs = tu_autotune_on_submit(queue->device, &queue->device->autotune, - cmd_buffers, - submit->vk_submit->command_buffer_count); + submit->cmd_buffers, + submit->nr_cmd_buffers); } uint32_t flags = MSM_PIPE_3D0; @@ -1062,7 +1065,7 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) submit->u_trace_submission_data = NULL; - for (uint32_t i = 0; i < submit->vk_submit->command_buffer_count; i++) { + for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { bool free_data = i == submission_data->last_buffer_with_tracepoints; if (submission_data->cmd_trace_data[i].trace) u_trace_flush(submission_data->cmd_trace_data[i].trace, diff --git a/src/freedreno/vulkan/tu_dynamic_rendering.c b/src/freedreno/vulkan/tu_dynamic_rendering.c new file mode 100644 index 00000000000..e29a742a6c4 --- /dev/null +++ 
b/src/freedreno/vulkan/tu_dynamic_rendering.c @@ -0,0 +1,237 @@ +/* + * Copyright © 2022 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* When using dynamic rendering with the suspend/resume functionality, we + * sometimes need to merge together multiple suspended render passes + * dynamically at submit time. This involves combining all the saved-up IBs, + * emitting the rendering commands usually emitted by + * CmdEndRenderPass()/CmdEndRendering(), and inserting them in between the + * user command buffers. This gets tricky, because the same command buffer can + * be submitted multiple times, each time with a different other set of + * command buffers, and with VK_COMMAND_BUFFER_SIMULTANEOUS_USE_BIT, this can + * happen before the previous submission of the same command buffer has + * finished. At some point we have to free these commands and the BOs they are + * contained in, and we can't do that when resubmitting the last command + * buffer in the sequence because it may still be in use. This means we have + * to make the commands owned by the device and roll our own memory tracking. + */ + +#include "tu_private.h" +#include "tu_cs.h" + +struct dynamic_rendering_entry { + struct tu_cmd_buffer *cmd_buffer; + uint32_t fence; /* The fence value when cmd_buffer becomes available */ +}; + +static VkResult +get_cmd_buffer(struct tu_device *dev, struct tu_cmd_buffer **cmd_buffer_out) +{ + struct tu6_global *global = dev->global_bo->map; + + /* Note: because QueueSubmit is serialized, we don't need any locks here. + */ + uint32_t fence = global->dynamic_rendering_fence; + + /* Go through the entries and return the finished ones to the pool, + * shrinking the array of pending entries. 
+ */ + struct dynamic_rendering_entry *new_entry = + util_dynarray_begin(&dev->dynamic_rendering_pending); + uint32_t entries = 0; + util_dynarray_foreach(&dev->dynamic_rendering_pending, + struct dynamic_rendering_entry, entry) { + if (entry->fence <= fence) { + VkCommandBuffer vk_buf = tu_cmd_buffer_to_handle(entry->cmd_buffer); + tu_FreeCommandBuffers(tu_device_to_handle(dev), + dev->dynamic_rendering_pool, 1, &vk_buf); + } else { + *new_entry = *entry; + new_entry++; + entries++; + } + } + UNUSED void *dummy = + util_dynarray_resize(&dev->dynamic_rendering_pending, + struct dynamic_rendering_entry, entries); + + VkCommandBuffer vk_buf; + const VkCommandBufferAllocateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = NULL, + .commandPool = dev->dynamic_rendering_pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + VkResult result = + tu_AllocateCommandBuffers(tu_device_to_handle(dev), &info, &vk_buf); + if (result != VK_SUCCESS) + return result; + + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, vk_buf); + + struct dynamic_rendering_entry entry = { + .cmd_buffer = cmd_buffer, + .fence = ++dev->dynamic_rendering_fence, + }; + + util_dynarray_append(&dev->dynamic_rendering_pending, + struct dynamic_rendering_entry, entry); + *cmd_buffer_out = cmd_buffer; + + return VK_SUCCESS; +} + +VkResult +tu_init_dynamic_rendering(struct tu_device *dev) +{ + util_dynarray_init(&dev->dynamic_rendering_pending, NULL); + dev->dynamic_rendering_fence = 0; + + return tu_CreateCommandPool(tu_device_to_handle(dev), + &(VkCommandPoolCreateInfo) { + .pNext = NULL, + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = 0, + .queueFamilyIndex = 0, + }, &dev->vk.alloc, &dev->dynamic_rendering_pool); +} + +void +tu_destroy_dynamic_rendering(struct tu_device *dev) +{ + tu_DestroyCommandPool(tu_device_to_handle(dev), + dev->dynamic_rendering_pool, + &dev->vk.alloc); + util_dynarray_fini(&dev->dynamic_rendering_pending); +} + +VkResult +tu_insert_dynamic_cmdbufs(struct tu_device *dev, + struct tu_cmd_buffer ***cmds_ptr, + uint32_t *size) +{ + struct tu_cmd_buffer **old_cmds = *cmds_ptr; + + bool has_dynamic = false; + for (unsigned i = 0; i < *size; i++) { + if (old_cmds[i]->state.suspend_resume != SR_NONE) { + has_dynamic = true; + break; + } + } + + if (!has_dynamic) + return VK_SUCCESS; + + struct util_dynarray cmds = {0}; + struct tu_cmd_buffer *cmd_buffer = NULL; + + for (unsigned i = 0; i < *size; i++) { + switch (old_cmds[i]->state.suspend_resume) { + case SR_NONE: + case SR_IN_CHAIN: + case SR_IN_PRE_CHAIN: + break; + + case SR_AFTER_PRE_CHAIN: + case SR_IN_CHAIN_AFTER_PRE_CHAIN: + tu_append_pre_chain(cmd_buffer, old_cmds[i]); + + if (!(old_cmds[i]->usage_flags & + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) { + u_trace_disable_event_range(old_cmds[i]->pre_chain.trace_renderpass_start, + old_cmds[i]->pre_chain.trace_renderpass_end); + } + + tu_cmd_render(cmd_buffer); + + tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(&cmd_buffer->cs, + global_iova(cmd_buffer, dynamic_rendering_fence)); + tu_cs_emit(&cmd_buffer->cs, dev->dynamic_rendering_fence); + + tu_EndCommandBuffer(tu_cmd_buffer_to_handle(cmd_buffer)); + util_dynarray_append(&cmds, struct tu_cmd_buffer *, cmd_buffer); + cmd_buffer = NULL; + break; + } + + util_dynarray_append(&cmds, struct tu_cmd_buffer *, old_cmds[i]); + + switch (old_cmds[i]->state.suspend_resume) { + case SR_NONE: + case SR_AFTER_PRE_CHAIN: + break; + case SR_IN_CHAIN: + case 
SR_IN_CHAIN_AFTER_PRE_CHAIN: {
+         assert(!cmd_buffer);
+         VkResult result = get_cmd_buffer(dev, &cmd_buffer);
+         if (result != VK_SUCCESS)
+            return result;
+
+         tu_cmd_buffer_begin(cmd_buffer,
+                             VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+
+         /* Setup the render pass using the first command buffer involved in
+          * the chain, so that it will look like we're inside a render pass
+          * for tu_cmd_render().
+          */
+         tu_restore_suspended_pass(cmd_buffer, old_cmds[i]);
+         FALLTHROUGH;
+      }
+      case SR_IN_PRE_CHAIN:
+         assert(cmd_buffer);
+
+         tu_append_pre_post_chain(cmd_buffer, old_cmds[i]);
+
+         if (!(old_cmds[i]->usage_flags &
+               VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
+            u_trace_disable_event_range(old_cmds[i]->trace_renderpass_start,
+                                        old_cmds[i]->trace_renderpass_end);
+         }
+
+         /* When the command buffer is finally recorded, we need its state
+          * to be the state of the command buffer before it. We need this
+          * because we skip tu6_emit_hw().
+          */
+         cmd_buffer->state.ccu_state = old_cmds[i]->state.ccu_state;
+         cmd_buffer->vsc_draw_strm_pitch = old_cmds[i]->vsc_draw_strm_pitch;
+         cmd_buffer->vsc_prim_strm_pitch = old_cmds[i]->vsc_prim_strm_pitch;
+         break;
+      }
+   }
+
+   struct tu_cmd_buffer **new_cmds =
+      vk_alloc(&dev->vk.alloc, cmds.size, alignof(struct tu_cmd_buffer *),
+               VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!new_cmds)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   memcpy(new_cmds, cmds.data, cmds.size);
+   *cmds_ptr = new_cmds;
+   *size = util_dynarray_num_elements(&cmds, struct tu_cmd_buffer *);
+   util_dynarray_fini(&cmds);
+
+   return VK_SUCCESS;
+}
+
diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c
index 0943e09c63a..08f08910518 100644
--- a/src/freedreno/vulkan/tu_kgsl.c
+++ b/src/freedreno/vulkan/tu_kgsl.c
@@ -367,6 +367,9 @@ tu_QueueSubmit2(VkQueue _queue,
       tu_dbg_log_gmem_load_store_skips(queue->device);
    }
 
+   struct tu_cmd_buffer **submit_cmd_buffers[submitCount];
+   uint32_t submit_cmd_buffer_count[submitCount];
+
    uint32_t max_entry_count = 0;
    for (uint32_t i = 0; i < submitCount; ++i) {
       const VkSubmitInfo2 *submit = pSubmits + i;
@@ -375,17 +378,34 @@ tu_QueueSubmit2(VkQueue _queue,
          vk_find_struct_const(pSubmits[i].pNext,
                               PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
 
-      uint32_t entry_count = 0;
-      struct tu_cmd_buffer *cmd_buffers[submit->commandBufferInfoCount];
-      for (uint32_t j = 0; j < submit->commandBufferInfoCount; ++j) {
+      struct tu_cmd_buffer *old_cmd_buffers[submit->commandBufferInfoCount];
+      uint32_t cmdbuf_count = submit->commandBufferInfoCount;
+      for (uint32_t j = 0; j < cmdbuf_count; ++j) {
         TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBufferInfos[j].commandBuffer);
-         cmd_buffers[j] = cmdbuf;
-         entry_count += cmdbuf->cs.entry_count;
+         old_cmd_buffers[j] = cmdbuf;
+      }
+
+      struct tu_cmd_buffer **cmd_buffers = old_cmd_buffers;
+      tu_insert_dynamic_cmdbufs(queue->device, &cmd_buffers, &cmdbuf_count);
+      if (cmd_buffers == old_cmd_buffers) {
+         cmd_buffers =
+            vk_alloc(&queue->device->vk.alloc,
+                     sizeof(*cmd_buffers) * cmdbuf_count, 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+         memcpy(cmd_buffers, old_cmd_buffers,
+                sizeof(*cmd_buffers) * cmdbuf_count);
+      }
+      submit_cmd_buffers[i] = cmd_buffers;
+      submit_cmd_buffer_count[i] = cmdbuf_count;
+
+      uint32_t entry_count = 0;
+      for (uint32_t j = 0; j < cmdbuf_count; ++j) {
+         entry_count += cmd_buffers[j]->cs.entry_count;
         if (perf_info)
           entry_count++;
       }
 
-      if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferInfoCount))
+      if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count))
        entry_count++;
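Both submit paths (the DRM path in tu_drm.c above and the KGSL path here) rely on the same contract from tu_insert_dynamic_cmdbufs(). A sketch of that contract as the callers see it; this is illustrative rather than driver code, error handling is elided, and user_cmds/user_count are placeholder names:

   struct tu_cmd_buffer **cmds = user_cmds;   /* caller-owned array */
   uint32_t count = user_count;
   tu_insert_dynamic_cmdbufs(dev, &cmds, &count);
   /* If no command buffer was part of a suspend/resume chain, cmds and
    * count are left untouched. Otherwise cmds now points at a freshly
    * allocated array in which driver-owned command buffers, each finishing
    * a merged chain with tu_cmd_render() plus a fence write, are spliced
    * between the user's command buffers. */

   /* ... build and issue the kernel submit from cmds[0..count) ... */

   if (cmds != user_cmds)
      vk_free(&dev->vk.alloc, cmds);  /* mirrors tu_queue_submit_finish() */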
max_entry_count = MAX2(max_entry_count, entry_count); @@ -406,10 +426,10 @@ tu_QueueSubmit2(VkQueue _queue, PERFORMANCE_QUERY_SUBMIT_INFO_KHR); - struct tu_cmd_buffer *cmd_buffers[submit->commandBufferInfoCount]; - for (uint32_t j = 0; j < submit->commandBufferInfoCount; j++) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBufferInfos[j].commandBuffer); - cmd_buffers[j] = cmdbuf; + struct tu_cmd_buffer **cmd_buffers = submit_cmd_buffers[i]; + uint32_t cmdbuf_count = submit_cmd_buffer_count[i]; + for (uint32_t j = 0; j < cmdbuf_count; j++) { + struct tu_cmd_buffer *cmdbuf = cmd_buffers[j]; struct tu_cs *cs = &cmdbuf->cs; if (perf_info) { @@ -436,12 +456,12 @@ tu_QueueSubmit2(VkQueue _queue, } } - if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferInfoCount)) { + if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { struct tu_cs *autotune_cs = tu_autotune_on_submit(queue->device, &queue->device->autotune, cmd_buffers, - submit->commandBufferInfoCount); + cmdbuf_count); cmds[entry_idx++] = (struct kgsl_command_object) { .offset = autotune_cs->entries[0].offset, .gpuaddr = autotune_cs->entries[0].bo->iova, diff --git a/src/freedreno/vulkan/tu_lrz.c b/src/freedreno/vulkan/tu_lrz.c index 07d05087c51..0c78ddf8b1b 100644 --- a/src/freedreno/vulkan/tu_lrz.c +++ b/src/freedreno/vulkan/tu_lrz.c @@ -275,6 +275,32 @@ tu_lrz_init_secondary(struct tu_cmd_buffer *cmd, cmd->state.lrz.reuse_previous_state = false; } +/* This is generally the same as tu_lrz_begin_renderpass(), but we skip + * actually emitting anything. The lrz state needs to be consistent between + * renderpasses, but only the first should actually emit commands to disable + * lrz etc. + */ +void +tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd, + const VkClearValue *clear_values) +{ + /* Track LRZ valid state */ + memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz)); + uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; + if (a != VK_ATTACHMENT_UNUSED) { + const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; + tu_lrz_init_state(cmd, att, cmd->state.attachments[a]); + if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) { + VkClearValue clear = clear_values[a]; + cmd->state.lrz.depth_clear_value = clear; + cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear && + (clear.depthStencil.depth == 0.f || + clear.depthStencil.depth == 1.f); + } + cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + } +} + void tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd, const VkClearValue *clear_values) @@ -304,20 +330,7 @@ tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd, } /* Track LRZ valid state */ - memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz)); - uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; - if (a != VK_ATTACHMENT_UNUSED) { - const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; - tu_lrz_init_state(cmd, att, cmd->state.attachments[a]); - if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) { - VkClearValue clear = clear_values[a]; - cmd->state.lrz.depth_clear_value = clear; - cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear && - (clear.depthStencil.depth == 0.f || - clear.depthStencil.depth == 1.f); - } - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; - } + tu_lrz_begin_resumed_renderpass(cmd, clear_values); if (!cmd->state.lrz.valid) { tu6_emit_lrz_buffer(&cmd->cs, NULL); diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 
55afea180b1..d332f7620be 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -489,6 +489,9 @@ struct tu6_global
    /* To know when renderpass stats for autotune are valid */
    volatile uint32_t autotune_fence;
 
+   /* For recycling command buffers for dynamic suspend/resume commands */
+   volatile uint32_t dynamic_rendering_fence;
+
    volatile uint32_t dbg_one;
    volatile uint32_t dbg_gmem_total_loads;
    volatile uint32_t dbg_gmem_taken_loads;
@@ -593,6 +596,10 @@ struct tu_device
    struct tu_cs *perfcntrs_pass_cs;
    struct tu_cs_entry *perfcntrs_pass_cs_entries;
 
+   struct util_dynarray dynamic_rendering_pending;
+   VkCommandPool dynamic_rendering_pool;
+   uint32_t dynamic_rendering_fence;
+
    /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
    pthread_cond_t timeline_cond;
@@ -624,6 +631,14 @@ void tu_init_clear_blit_shaders(struct tu_device *dev);
 
 void tu_destroy_clear_blit_shaders(struct tu_device *dev);
 
+VkResult tu_init_dynamic_rendering(struct tu_device *dev);
+
+void tu_destroy_dynamic_rendering(struct tu_device *dev);
+
+VkResult tu_insert_dynamic_cmdbufs(struct tu_device *dev,
+                                   struct tu_cmd_buffer ***cmds_ptr,
+                                   uint32_t *size);
+
 VkResult
 tu_device_submit_deferred_locked(struct tu_device *dev);
 
@@ -1327,6 +1342,8 @@ struct tu_render_pass_state
    uint32_t drawcall_bandwidth_per_sample_sum;
 };
 
+void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
+                                const struct tu_render_pass_state *src);
 
 struct tu_cmd_state
 {
    uint32_t dirty;
@@ -1403,6 +1420,22 @@ struct tu_cmd_state
 
    const struct tu_image_view **attachments;
 
+   /* State that in the dynamic case comes from VkRenderingInfo and needs to
+    * be saved/restored when suspending. This holds the state for the last
+    * suspended renderpass, which may point to this command buffer's dynamic_*
+    * or another command buffer if executed on a secondary.
+    */
+   struct {
+      const struct tu_render_pass *pass;
+      const struct tu_subpass *subpass;
+      const struct tu_framebuffer *framebuffer;
+      VkRect2D render_area;
+
+      const struct tu_image_view **attachments;
+
+      struct tu_lrz_state lrz;
+   } suspended_pass;
+
    bool tessfactor_addr_set;
    bool predication_active;
    enum a5xx_line_mode line_mode;
@@ -1416,6 +1449,97 @@ struct tu_cmd_state
 
    bool prim_generated_query_running_before_rp;
 
+   /* These are the states of the suspend/resume state machine. In addition to
+    * tracking whether we're in the middle of a chain of suspending and
+    * resuming passes that will be merged, we need to track whether the
+    * command buffer begins in the middle of such a chain, for when it gets
+    * merged with other command buffers. We call such a chain that begins
+    * before the command buffer starts a "pre-chain".
+    *
+    * Note that when this command buffer is finished, this state is untouched
+    * but it gains a different meaning. For example, if we finish in state
+    * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
+    * there's a suspend/resume chain that extends past the end of the command
+    * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
+    * means that there's a suspend/resume chain that extends before the
+    * beginning.
+    */
+   enum {
+      /* Either there are no suspend/resume chains, or they are entirely
+       * contained in the current command buffer.
+       *
+       *   BeginCommandBuffer() <- start of current command buffer
+       *       ...
+       *       // we are here
+       */
+      SR_NONE = 0,
+
+      /* We are in the middle of a suspend/resume chain that starts before the
+       * current command buffer.
This happens when the command buffer begins + * with a resuming render pass and all of the passes up to the current + * one are suspending. In this state, our part of the chain is not saved + * and is in the current draw_cs/state. + * + * BeginRendering() ... EndRendering(suspending) + * BeginCommandBuffer() <- start of current command buffer + * BeginRendering(resuming) ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * ... + * // we are here + */ + SR_IN_PRE_CHAIN, + + /* We are currently outside of any suspend/resume chains, but there is a + * chain starting before the current command buffer. It is saved in + * pre_chain. + * + * BeginRendering() ... EndRendering(suspending) + * BeginCommandBuffer() <- start of current command buffer + * // This part is stashed in pre_chain + * BeginRendering(resuming) ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * ... + * BeginRendering(resuming) ... EndRendering() // end of chain + * ... + * // we are here + */ + SR_AFTER_PRE_CHAIN, + + /* We are in the middle of a suspend/resume chain and there is no chain + * starting before the current command buffer. + * + * BeginCommandBuffer() <- start of current command buffer + * ... + * BeginRendering() ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * ... + * // we are here + */ + SR_IN_CHAIN, + + /* We are in the middle of a suspend/resume chain and there is another, + * separate, chain starting before the current command buffer. + * + * BeginRendering() ... EndRendering(suspending) + * CommandBufferBegin() <- start of current command buffer + * // This part is stashed in pre_chain + * BeginRendering(resuming) ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * ... + * BeginRendering(resuming) ... EndRendering() // end of chain + * ... + * BeginRendering() ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * BeginRendering(resuming) ... EndRendering(suspending) + * ... + * // we are here + */ + SR_IN_CHAIN_AFTER_PRE_CHAIN, + } suspend_resume; + + bool suspending, resuming; + struct tu_lrz_state lrz; struct tu_draw_state lrz_and_depth_plane_state; @@ -1487,6 +1611,24 @@ struct tu_cmd_buffer struct tu_cs draw_epilogue_cs; struct tu_cs sub_cs; + /* If the first render pass in the command buffer is resuming, then it is + * part of a suspend/resume chain that starts before the current command + * buffer and needs to be merged later. In this case, its incomplete state + * is stored in pre_chain. In the symmetric case where the last render pass + * is suspending, we just skip ending the render pass and its state is + * stored in draw_cs/the current state. The first and last render pass + * might be part of different chains, which is why all the state may need + * to be saved separately here. 
+ */ + struct { + struct tu_cs draw_cs; + struct tu_cs draw_epilogue_cs; + + struct u_trace_iterator trace_renderpass_start, trace_renderpass_end; + + struct tu_render_pass_state state; + } pre_chain; + uint32_t vsc_draw_strm_pitch; uint32_t vsc_prim_strm_pitch; }; @@ -1504,6 +1646,8 @@ struct tu_reg_value { uint32_t bo_shift; }; +VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer, + VkCommandBufferUsageFlags usage_flags); void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, struct tu_cs *cs); @@ -1521,6 +1665,24 @@ void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer, void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer, const VkRenderingInfo *pRenderingInfo); +void +tu_append_pre_chain(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *secondary); + +void +tu_append_pre_post_chain(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *secondary); + +void +tu_append_post_chain(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *secondary); + +void +tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, + struct tu_cmd_buffer *suspended); + +void tu_cmd_render(struct tu_cmd_buffer *cmd); + void tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -1756,6 +1918,10 @@ void tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd, const VkClearValue *clear_values); +void +tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd, + const VkClearValue *clear_values); + void tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);
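Pulled together from the tu_CmdBeginRendering()/tu_CmdEndRendering() hunks above, the suspend/resume state machine transitions can be summarized as follows. This is a condensed sketch rather than code from the patch, and the enum name tu_suspend_resume_state is made up here for readability (the patch declares the enum anonymously inside struct tu_cmd_state):

   /* sr is cmd->state.suspend_resume; invalid combinations hit
    * unreachable() in the real code and are omitted here. */
   static enum tu_suspend_resume_state
   sr_on_begin_rendering(enum tu_suspend_resume_state sr,
                         bool resuming, bool suspending)
   {
      if (suspending && !resuming)        /* entering a chain */
         return sr == SR_NONE ? SR_IN_CHAIN : SR_IN_CHAIN_AFTER_PRE_CHAIN;
      if (resuming && sr == SR_NONE)      /* chain began before this buffer */
         return SR_IN_PRE_CHAIN;
      return sr;
   }

   static enum tu_suspend_resume_state
   sr_on_end_rendering(enum tu_suspend_resume_state sr,
                       bool resuming, bool suspending)
   {
      if (resuming && !suspending)        /* exiting a chain */
         return sr == SR_IN_CHAIN ? SR_NONE : SR_AFTER_PRE_CHAIN;
      return sr;
   }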