From 25b73dff5a01e160e8d42bbbc0026543efa96e84 Mon Sep 17 00:00:00 2001
From: Zan Dobersek
Date: Wed, 23 Oct 2024 12:07:50 +0200
Subject: [PATCH] tu/a7xx: use concurrent resolve groups

Establish groups of resolve and unresolve operations that the a7xx
hardware can use to improve efficiency. Grouping allows command stream
processing to continue while the grouped (un)resolves are still in
progress, as long as subsequent operations don't depend on them.

To enable concurrent resolves and unresolves, the corresponding fields
of the RB_CCU_CNTL register have to be set appropriately.

Resolve groups are tracked through a scoped struct that records any
pending resolve operations. Once a group is complete, the emit helper
function writes the CCU_END_RESOLVE_GROUP event out to the command
stream.

The buffer ID field of the RB_BLIT_INFO register is used to disperse
different resolve operations across all available slots in the resolve
engine. The 0x8 and 0x9 IDs are reserved for depth and stencil buffers,
while the 0x0-0x7 range is used for color buffers. A simple
incrementing counter assigns IDs to the color buffers within a resolve
group. Two color or depth/stencil buffers inside the same resolve group
can end up with identical IDs, but the hardware doesn't seem to have a
problem handling that.

Two TU_DEBUG options, 'noconcurrentresolves' and
'noconcurrentunresolves', disable the respective operations by
adjusting the mode set through RB_CCU_CNTL.

Signed-off-by: Zan Dobersek
Reviewed-by: Danylo Piliaiev
Part-of:
---
 src/freedreno/vulkan/tu_clear_blit.cc | 184 ++++++++++++++++++++++----
 src/freedreno/vulkan/tu_clear_blit.h  |  15 +++
 src/freedreno/vulkan/tu_cmd_buffer.cc |  58 +++++---
 src/freedreno/vulkan/tu_cmd_buffer.h  |   7 +
 src/freedreno/vulkan/tu_util.cc       |   2 +
 src/freedreno/vulkan/tu_util.h        |   2 +
 6 files changed, 224 insertions(+), 44 deletions(-)

diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index a7cfe5b7553..64f2e57243c 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -1516,6 +1516,8 @@ aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspe
          mask = 0x1;
       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
          mask = 0x2;
+      if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
+         mask = 0x3;
    }
    return mask;
 }
@@ -1882,6 +1884,7 @@ pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, ui
 
 static void
 event_blit_setup(struct tu_cs *cs,
+                 uint32_t buffer_id,
                  const struct tu_render_pass_attachment *att,
                  enum a6xx_blit_event_type blit_event_type,
                  uint32_t clear_mask)
@@ -1899,7 +1902,8 @@ event_blit_setup(struct tu_cs *cs,
                         vk_format_is_int(att->format) ||
                         vk_format_is_depth_or_stencil(att->format),
                      .depth = vk_format_is_depth_or_stencil(att->format),
-                     .clear_mask = clear_mask, ));
+                     .clear_mask = clear_mask,
+                     .buffer_id = buffer_id));
 }
 
 struct event_blit_dst_view {
@@ -1984,6 +1988,7 @@ event_blit_run(struct tu_cmd_buffer *cmd,
 static void
 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
+                        uint32_t buffer_id,
                         enum pipe_format format,
                         uint8_t clear_mask,
                         bool separate_stencil,
@@ -2003,7 +2008,7 @@ tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
 
    event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
 
-   event_blit_setup(cs, att, BLIT_EVENT_CLEAR, clear_mask);
+   event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
    event_blit_run(cmd, cs,
                   att, &blt_view, separate_stencil);
 }
@@ -3476,6 +3481,70 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
 }
 TU_GENX(tu_resolve_sysmem);
 
+enum tu_resolve_group_buffer_type {
+   TU_RESOLVE_GROUP_COLOR_BUFFER,
+   TU_RESOLVE_GROUP_DEPTH_BUFFER,
+   TU_RESOLVE_GROUP_STENCIL_BUFFER,
+};
+
+template <chip CHIP>
+static uint32_t
+tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
+                                enum tu_resolve_group_buffer_type buffer_type)
+{
+   /* Resolve groups are not usable on a6xx, so no pending resolve is
+    * established. The default value of 0 is returned as the buffer ID.
+    */
+   if (CHIP == A6XX)
+      return 0;
+
+   resolve_group->pending_resolves = true;
+
+   if (buffer_type == TU_RESOLVE_GROUP_DEPTH_BUFFER)
+      return 0x8;
+   if (buffer_type == TU_RESOLVE_GROUP_STENCIL_BUFFER)
+      return 0x9;
+
+   const uint32_t max_color_buffers = 8;
+   uint32_t buffer_id = resolve_group->color_buffer_id++;
+   return buffer_id % max_color_buffers;
+}
+
+template <chip CHIP>
+static uint32_t
+tu_resolve_group_include_buffer_for_format(struct tu_resolve_group *resolve_group,
+                                           VkFormat format)
+{
+   enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
+
+   /* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
+    * whether depth, stencil or both are being resolved.
+    */
+   if (format == VK_FORMAT_D24_UNORM_S8_UINT)
+      buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
+
+   return tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
+}
+
+template <chip CHIP>
+void
+tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
+                      struct tu_cs *cs,
+                      struct tu_resolve_group *resolve_group)
+{
+   /* Resolve groups are not usable on A6XX, so that template instantiation
+    * should behave as a no-op.
+    */
+   if (CHIP == A6XX || !resolve_group->pending_resolves)
+      return;
+
+   resolve_group->color_buffer_id = 0;
+   resolve_group->pending_resolves = false;
+
+   tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
+}
+TU_GENX(tu_emit_resolve_group);
+
 template <chip CHIP>
 static void
 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
@@ -3538,6 +3607,7 @@ clear_image_cp_blit(struct tu_cmd_buffer *cmd,
 static void
 clear_image_event_blit(struct tu_cmd_buffer *cmd,
                        struct tu_image *image,
+                       uint32_t buffer_id,
                        const VkClearValue *clear_value,
                        const VkImageSubresourceRange *range,
                        VkImageAspectFlags aspect_mask)
@@ -3573,7 +3643,8 @@ clear_image_event_blit(struct tu_cmd_buffer *cmd,
                      .sample_0 = vk_format_is_int(vk_format) ||
                                  vk_format_is_depth_or_stencil(vk_format),
                      .depth = vk_format_is_depth_or_stencil(vk_format),
-                     .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask)));
+                     .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
+                     .buffer_id = buffer_id));
 
    uint32_t clear_vals[4] = {};
    pack_blit_event_clear_value(clear_value, format, clear_vals);
@@ -3656,12 +3727,13 @@ template <chip CHIP>
 static void
 clear_image(struct tu_cmd_buffer *cmd,
             struct tu_image *image,
+            uint32_t buffer_id,
             const VkClearValue *clear_value,
             const VkImageSubresourceRange *range,
             VkImageAspectFlags aspect_mask)
 {
    if (use_generic_clear_for_image_clear(cmd, image)) {
-      clear_image_event_blit(cmd, image, clear_value, range, aspect_mask);
+      clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
    } else {
       clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
    }
 }
@@ -3686,9 +3758,14 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
       tu_emit_cache_flush<CHIP>(cmd);
    }
 
+   struct tu_resolve_group resolve_group = {};
+
    for (unsigned i = 0; i < rangeCount; i++) {
-      clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i,
-                        VK_IMAGE_ASPECT_COLOR_BIT);
+      uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_COLOR_BUFFER);
+      clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor,
+                        pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
    }
+
+   tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
 }
 TU_GENX(tu_CmdClearColorImage);
@@ -3712,19 +3789,31 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
       tu_emit_cache_flush<CHIP>(cmd);
    }
 
+   struct tu_resolve_group resolve_group = {};
+
    for (unsigned i = 0; i < rangeCount; i++) {
       const VkImageSubresourceRange *range = &pRanges[i];
 
       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
          /* can't clear both depth and stencil at once, split up the aspect mask */
-         u_foreach_bit(b, range->aspectMask)
-            clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
+         u_foreach_bit(b, range->aspectMask) {
+            uint32_t buffer_id = 0;
+            if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
+               buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
+            if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
+               buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
+
+            clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
+         }
          continue;
       }
 
-      clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
+      uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(&resolve_group, image->vk.format);
+      clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
    }
 
+   tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
+
    tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
 }
 TU_GENX(tu_CmdClearDepthStencilImage);
@@ -3933,6 +4022,7 @@ template <chip CHIP>
 static void
 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
+                      uint32_t buffer_id,
                       enum pipe_format format,
                       uint8_t clear_mask,
                       uint32_t gmem_offset,
@@ -3943,7 +4033,8 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                                      blit_base_format(format, false, true)));
 
    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
-                                         .clear_mask = clear_mask));
+                                         .clear_mask = clear_mask,
+                                         .buffer_id = buffer_id));
 
    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
    tu_cs_emit(cs, gmem_offset);
@@ -3964,6 +4055,7 @@ template <chip CHIP>
 static void
 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                               struct tu_cs *cs,
+                              struct tu_resolve_group *resolve_group,
                               uint32_t attachment,
                               uint32_t base_layer,
                               uint32_t layers,
@@ -3984,15 +4076,18 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
       uint32_t layer = i + base_layer;
       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
          if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
-            clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
+            uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
+            clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
                                         tu_attachment_gmem_offset(cmd, att, layer), value);
          }
          if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
-            clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
+            uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
+            clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
                                         tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
          }
       } else {
-         clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
+         uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(resolve_group, att->format);
+         clear_gmem_attachment<CHIP>(cmd,
+                                     cs, buffer_id, format, aspect_write_mask(format, mask),
                                      tu_attachment_gmem_offset(cmd, att, layer), value);
       }
    }
@@ -4016,6 +4111,8 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
    if (rect_count > 1)
       perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
 
+   struct tu_resolve_group resolve_group = {};
+
    for (unsigned i = 0; i < rect_count; i++) {
       unsigned x1 = rects[i].rect.offset.x;
       unsigned y1 = rects[i].rect.offset.y;
@@ -4036,13 +4133,16 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
          if (a == VK_ATTACHMENT_UNUSED)
             continue;
 
-         tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
+         tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
+                                             rects[i].baseArrayLayer,
                                              rects[i].layerCount,
                                              subpass->multiview_mask,
                                              attachments[j].aspectMask,
                                              &attachments[j].clearValue);
       }
    }
+
+   tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
 }
 
 template <chip CHIP>
@@ -4109,6 +4209,7 @@ static void
 tu7_clear_attachment_generic_single_rect(
    struct tu_cmd_buffer *cmd,
    struct tu_cs *cs,
+   struct tu_resolve_group *resolve_group,
    const struct tu_render_pass_attachment *att,
    const VkClearAttachment *clear_att,
    uint32_t a,
@@ -4136,15 +4237,18 @@ tu7_clear_attachment_generic_single_rect(
 
       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
          if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
-            tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
+            uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
+            tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
                                     false, layer, value, a);
          }
          if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
-            tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
+            uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
+            tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
                                     layer, value, a);
          }
       } else {
-         tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
+         uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
+         tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
      }
   }
 }
@@ -4178,6 +4282,8 @@ tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
    tu_cs_emit_wfi(cs);
    tu_cond_exec_end(cs);
 
+   struct tu_resolve_group resolve_group = {};
+
    const struct tu_subpass *subpass = cmd->state.subpass;
    for (uint32_t i = 0; i < attachmentCount; i++) {
       uint32_t a;
@@ -4194,11 +4300,13 @@ tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
                                  iview->view.ubwc_enabled, att->samples);
          for (unsigned j = 0; j < rectCount; j++) {
             tu7_clear_attachment_generic_single_rect(
-               cmd, cs, att, &pAttachments[i], a, &pRects[j]);
+               cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
          }
          trace_end_generic_clear(&cmd->trace, cs);
       }
    }
+
+   tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
 }
 
 template <chip CHIP>
@@ -4330,6 +4438,7 @@ template <chip CHIP>
 void
 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
+                         struct tu_resolve_group *resolve_group,
                          uint32_t a)
 {
    const struct tu_render_pass_attachment *attachment =
@@ -4338,7 +4447,8 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
    if (!attachment->clear_mask)
       return;
 
-   tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
+   tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
+                                       cmd->state.framebuffer->layers,
                                        attachment->clear_views,
                                        attachment->clear_mask,
                                        &cmd->state.clear_values[a]);
@@ -4346,7 +4456,10 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
 TU_GENX(tu_clear_gmem_attachment);
 
 void
-tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
+tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
+                             struct tu_cs *cs,
+                             struct tu_resolve_group *resolve_group,
+                             uint32_t a)
 {
    const struct tu_render_pass_attachment *att =
       &cmd->state.pass->attachments[a];
@@ -4363,15 +4476,18 @@ tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32
       aspect_write_mask_generic_clear(format, att->clear_mask);
 
    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
       if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
-         tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
+         uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
+         tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
                                  false, layer, value, a);
       }
       if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
-         tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
+         uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
+         tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
                                  layer, value, a);
       }
    } else {
-      tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
+      uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
+      tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
    }
 }
@@ -4385,6 +4501,7 @@ template <chip CHIP>
 static void
 tu_emit_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
+             struct tu_resolve_group *resolve_group,
              const struct tu_image_view *iview,
              const struct tu_render_pass_attachment *attachment,
              const VkClearValue *clear_value,
@@ -4426,7 +4543,18 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
       tu_cs_emit_array(cs, clear_vals, 4);
    }
 
-   event_blit_setup(cs, attachment, blit_event_type, clear_mask);
+   enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
+   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
+      if (!separate_stencil)
+         buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
+      else
+         buffer_type = TU_RESOLVE_GROUP_STENCIL_BUFFER;
+   } else if (attachment->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+      buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
+   }
+
+   uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
+   event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
 
    for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
       event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
@@ -4618,6 +4746,7 @@ template <chip CHIP>
 void
 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
+                        struct tu_resolve_group *resolve_group,
                         uint32_t a,
                         bool cond_exec_allowed,
                         bool force_load)
@@ -4659,10 +4788,10 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
          load_3d_blit(cmd, cs, iview, attachment, true);
    } else {
       if (load_common)
-         tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
 
       if (load_stencil)
-         tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
    }
 
    if (cond_exec)
@@ -4928,6 +5057,7 @@ template <chip CHIP>
 void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
+                         struct tu_resolve_group *resolve_group,
                          uint32_t a,
                          uint32_t gmem_a,
                          uint32_t layers,
@@ -4982,9 +5112,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
    /* use fast path when render area is aligned, except for unsupported resolve cases */
    if (use_fast_path) {
       if (store_common)
-         tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, false);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, false);
       if (store_separate_stencil)
-         tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, true);
+         tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, true);
 
       if (cond_exec) {
          tu_end_load_store_cond_exec(cmd, cs, false);
diff --git a/src/freedreno/vulkan/tu_clear_blit.h b/src/freedreno/vulkan/tu_clear_blit.h
index cd938dc184d..d2a3c754f88 100644
--- a/src/freedreno/vulkan/tu_clear_blit.h
+++ b/src/freedreno/vulkan/tu_clear_blit.h
@@ -34,6 +34,17 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                   uint32_t layers,
                   const VkRect2D *rect);
 
+struct tu_resolve_group {
+   uint32_t color_buffer_id;
+   bool pending_resolves;
+};
+
+template <chip CHIP>
+void
+tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
+                      struct tu_cs *cs,
+                      struct tu_resolve_group *resolve_group);
+
 template <chip CHIP>
 void
 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
@@ -44,17 +55,20 @@ template <chip CHIP>
 void
 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
+                         struct tu_resolve_group *resolve_group,
                          uint32_t a);
 
 void
 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
+                             struct tu_resolve_group *resolve_group,
                              uint32_t a);
 
 template <chip CHIP>
 void
 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
+                        struct tu_resolve_group *resolve_group,
                         uint32_t a,
                         bool cond_exec_allowed,
                         bool force_load);
@@ -64,6 +78,7 @@ template <chip CHIP>
 void
 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
+                         struct tu_resolve_group *resolve_group,
                          uint32_t a,
                          uint32_t gmem_a,
                          uint32_t layers,
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 18777d52299..4dc597de9f9 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -44,7 +44,7 @@ tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 }
 
 template <chip CHIP>
-static void
+void
 tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         enum vgt_event_type event,
@@ -67,6 +67,7 @@ tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
       tu_cs_emit(cs, 0);
    }
 }
+TU_GENX(tu_emit_raw_event_write);
 
 template <chip CHIP>
 void
@@ -1241,6 +1242,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    tu6_emit_blit_scissor(cmd, cs, true);
 
+   struct tu_resolve_group resolve_group = {};
+
    /* Resolve should happen before store in case BLIT_EVENT_STORE_AND_CLEAR is
    * used for a store.
    */
@@ -1249,8 +1252,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
          uint32_t a = subpass->resolve_attachments[i].attachment;
          if (a != VK_ATTACHMENT_UNUSED) {
             uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
-            tu_store_gmem_attachment<CHIP>(cmd, cs, a, gmem_a, fb->layers,
-                                           subpass->multiview_mask, false);
+            tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
+                                           fb->layers, subpass->multiview_mask, false);
          }
       }
    }
@@ -1259,12 +1262,14 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
       if (pass->attachments[a].gmem) {
          const bool cond_exec_allowed = cmd->state.tiling->binning_possible &&
                                         cmd->state.pass->has_cond_load_store;
-         tu_store_gmem_attachment<CHIP>(cmd, cs, a, a,
+         tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, a,
                                         fb->layers, subpass->multiview_mask,
                                         cond_exec_allowed);
       }
    }
 
+   tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
+
    if (pass->has_fdm)
       tu_cs_set_writeable(cs, false);
 }
@@ -1295,10 +1300,20 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
     * change per-RP and don't require a WFI to take effect, only CCU inval/flush
     * events are required.
     */
-   tu_cs_emit_regs(cs, RB_CCU_CNTL(CHIP,
+
+   enum a7xx_concurrent_resolve_mode resolve_mode = CONCURRENT_RESOLVE_MODE_2;
+   if (TU_DEBUG(NO_CONCURRENT_RESOLVES))
+      resolve_mode = CONCURRENT_RESOLVE_MODE_DISABLED;
+
+   enum a7xx_concurrent_unresolve_mode unresolve_mode = CONCURRENT_UNRESOLVE_MODE_FULL;
+   if (TU_DEBUG(NO_CONCURRENT_UNRESOLVES))
+      unresolve_mode = CONCURRENT_UNRESOLVE_MODE_DISABLED;
+
+   tu_cs_emit_regs(cs, RB_CCU_CNTL(A7XX,
          .gmem_fast_clear_disable =
-            !dev->physical_device->info->a6xx.has_gmem_fast_clear,
-         .concurrent_resolve = dev->physical_device->info->a6xx.concurrent_resolve,
+            !dev->physical_device->info->a6xx.has_gmem_fast_clear,
+         .concurrent_resolve_mode = resolve_mode,
+         .concurrent_unresolve_mode = unresolve_mode,
    ));
 }
@@ -4467,7 +4482,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
 
 template <chip CHIP>
 static void
-tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
+tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
 {
    struct tu_cs *cs = &cmd->draw_cs;
    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
@@ -4498,7 +4513,8 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
             tu6_emit_blit_scissor(cmd, cs, true);
             emitted_scissor = true;
          }
-         tu_load_gmem_attachment<CHIP>(cmd, cs, i, cond_load_allowed, false);
+         tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
+                                       cond_load_allowed, false);
       }
    }
@@ -4513,7 +4529,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
             tu6_emit_blit_scissor(cmd, cs, false);
             emitted_scissor = true;
          }
-         tu_clear_gmem_attachment<CHIP>(cmd, cs, i);
+         tu_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, i);
       }
    }
 }
@@ -4546,7 +4562,7 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
 }
 
 static void
-tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd)
+tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
 {
    if (cmd->state.render_area.extent.width == 0 ||
        cmd->state.render_area.extent.height == 0)
@@ -4564,7 +4580,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd)
             tu6_emit_blit_scissor(cmd, cs, false);
             emitted_scissor = true;
          }
-         tu7_generic_clear_attachment(cmd, cs, i);
+         tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
       }
    }
 }
@@ -4582,12 +4598,16 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
 {
    tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass,
                              cmd->state.subpass);
-   tu_emit_subpass_begin_gmem<CHIP>(cmd);
+   struct tu_resolve_group resolve_group = {};
+
+   tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
    tu_emit_subpass_begin_sysmem<CHIP>(cmd);
    if (cmd->device->physical_device->info->a7xx.has_generic_clear) {
-      tu7_emit_subpass_clear(cmd);
+      tu7_emit_subpass_clear(cmd, &resolve_group);
    }
 
+   tu_emit_resolve_group<CHIP>(cmd, &cmd->draw_cs, &resolve_group);
+
    tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
@@ -4964,6 +4984,8 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
    if (subpass->resolve_attachments) {
       tu6_emit_blit_scissor(cmd, cs, true);
 
+      struct tu_resolve_group resolve_group = {};
+
       for (unsigned i = 0; i < subpass->resolve_count; i++) {
          uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
@@ -4971,8 +4993,8 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
 
          uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
 
-         tu_store_gmem_attachment<CHIP>(cmd, cs, a, gmem_a, fb->layers,
-                                        subpass->multiview_mask, false);
+         tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
+                                        fb->layers, subpass->multiview_mask, false);
 
         if (!pass->attachments[a].gmem)
            continue;
@@ -4981,8 +5003,10 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
           * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
           */
          perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n");
-         tu_load_gmem_attachment<CHIP>(cmd, cs, a, false, true);
+         tu_load_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, false, true);
       }
+
+      tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
    }
 
    tu_cond_exec_end(cs);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 669b8a0efc8..0be7f06e7e3 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -679,6 +679,13 @@ void tu_cmd_render(struct tu_cmd_buffer *cmd);
 
 enum fd_gpu_event : uint32_t;
 
+template <chip CHIP>
+void
+tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
+                        struct tu_cs *cs,
+                        enum vgt_event_type event,
+                        bool needs_seqno);
+
 template <chip CHIP>
 void
 tu_emit_event_write(struct tu_cmd_buffer *cmd,
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index 992a410129e..dee233656c5 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -43,6 +43,8 @@ static const struct debug_control tu_debug_options[] = {
    { "noconform", TU_DEBUG_NOCONFORM },
    { "rd", TU_DEBUG_RD },
    { "hiprio", TU_DEBUG_HIPRIO },
+   { "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
+   { "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
    { NULL, 0 }
 };
diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 0024d9ead2f..0910acc183e 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -49,6 +49,8 @@ enum tu_debug_flags
    TU_DEBUG_NOCONFORM = 1 << 24,
    TU_DEBUG_RD = 1 << 25,
    TU_DEBUG_HIPRIO = 1 << 26,
+   TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
+   TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
 };
 
 struct tu_env {