diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index ccfa0d1c7ae..e9fc2f428a8 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -1443,25 +1443,43 @@ tu6_emit_gmem_resolves(struct tu_cmd_buffer *cmd,
    }
 }
 
+/* Emits any tile stores at the end of a subpass.
+ *
+ * These are emitted into draw_cs for non-final subpasses, and tile_store_cs for
+ * the final subpass. The draw_cs ones mean that we have to disable IB2 skipping
+ * for the draw_cs so we don't exit before storing. The separate tile_store_cs
+ * lets us leave IB2 skipping enabled in the common case of a single-subpass
+ * renderpass (or dynamic rendering).
+ *
+ * To do better in the multi-subpass case, we'd need the individual CS entries
+ * of draw_cs to have a flag for whether they can be skipped or not, and
+ * interleave drawing CS entries with store CS entries.
+ *
+ * This is independent of cond_store_allowed, which is about "can we skip doing
+ * the store if no other rendering happened in the tile?" We can only skip if
+ * the condition that we set up at the start of the tile (or reset just before
+ * calling tile_store_cs) is still in place.
+ */
 template <chip CHIP>
 static void
-tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_emit_gmem_stores(struct tu_cmd_buffer *cmd,
+                     struct tu_cs *cs,
+                     struct tu_resolve_group *resolve_group,
+                     const struct tu_subpass *subpass)
 {
    const struct tu_render_pass *pass = cmd->state.pass;
-   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
+   uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
+   const bool cond_exec_allowed = vsc->binning_possible &&
+                                  cmd->state.pass->has_cond_load_store &&
+                                  (!cmd->state.rp.draw_cs_writes_to_cond_pred ||
+                                   cs != &cmd->draw_cs);
 
    if (pass->has_fdm)
       tu_cs_set_writeable(cs, true);
 
-   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
-   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
-                  A6XX_CP_SET_MARKER_0_USES_GMEM);
-
-   tu6_emit_blit_scissor(cmd, cs, true, false);
-
-   struct tu_resolve_group resolve_group = {};
+   bool scissor_emitted = false;
 
    /* Resolve should happen before store in case BLIT_EVENT_STORE_AND_CLEAR is
     * used for a store.
@@ -1471,22 +1489,50 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
     * been generated). a7xx has HW conditional resolve support that may skip
     * the resolve if geometry didn't cover it, anyway.
     */
-   tu6_emit_gmem_resolves(cmd, subpass, &resolve_group, cs);
+   if (subpass->resolve_attachments) {
+      if (!scissor_emitted) {
+         tu6_emit_blit_scissor(cmd, cs, true, false);
+         scissor_emitted = true;
+      }
+      tu6_emit_gmem_resolves(cmd, subpass, resolve_group, cs);
+   }
 
    for (uint32_t a = 0; a < pass->attachment_count; ++a) {
-      if (pass->attachments[a].gmem) {
-         const bool cond_exec_allowed = vsc->binning_possible &&
-                                        cmd->state.pass->has_cond_load_store;
-         tu_store_gmem_attachment(cmd, cs, &resolve_group, a, a,
+      const struct tu_render_pass_attachment *att = &pass->attachments[a];
+      /* Note: att->cond_store_allowed implies at least one of att->store_* set */
+      if (pass->attachments[a].gmem && att->last_subpass_idx == subpass_idx) {
+         if (!scissor_emitted) {
+            tu6_emit_blit_scissor(cmd, cs, true, false);
+            scissor_emitted = true;
+         }
+         tu_store_gmem_attachment(cmd, cs, resolve_group, a, a,
                                   fb->layers, subpass->multiview_mask,
                                   cond_exec_allowed);
       }
    }
+}
+
+template <chip CHIP>
+static void
+tu6_emit_tile_store_cs(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   const struct tu_render_pass *pass = cmd->state.pass;
+   uint32_t subpass_idx = pass->subpass_count - 1;
+   const struct tu_subpass *subpass = &pass->subpasses[subpass_idx];
+
+   /* We believe setting the marker affects what state HW blocks save/restore
+    * during preemption. So we only emit it before the stores at the end of the
+    * last subpass, not other resolves.
+    */
+   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
+                  A6XX_CP_SET_MARKER_0_USES_GMEM);
+
+   struct tu_resolve_group resolve_group = {};
+
+   tu6_emit_gmem_stores(cmd, cs, &resolve_group, subpass);
 
    tu_emit_resolve_group(cmd, cs, &resolve_group);
-
-   if (pass->has_fdm)
-      tu_cs_set_writeable(cs, false);
 }
 
 void
@@ -2444,6 +2490,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_tiling_config *tiling = cmd->state.tiling;
    const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
+   const struct tu_render_pass *pass = cmd->state.pass;
+
    tu_lrz_tiling_begin(cmd, cs);
 
    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@@ -2497,10 +2545,18 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                          A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
       }
 
-      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
-      tu_cs_emit(cs, 0x1);
-      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1);
-      tu_cs_emit(cs, 0x1);
+      /* Enable early return from CP_INDIRECT_BUFFER once the visibility stream
+       * is done. We don't enable this if there are stores in a non-final
+       * subpass, because it's more important to be able to share gmem space
+       * between attachments by storing early, than it is to do IB2 skipping
+       * (which has an effect we struggle to even measure).
+       */
+      if (pass->allow_ib2_skipping) {
+         tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
+         tu_cs_emit(cs, 0x1);
+         tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1);
+         tu_cs_emit(cs, 0x1);
+      }
    } else {
       if (vsc->binning_possible) {
          /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since
@@ -2563,8 +2619,14 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
       tu6_emit_cond_for_load_stores(cmd, cs, tile->pipe, slot, false);
    }
 
-   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
-   tu_cs_emit(cs, 0x0);
+   if (cmd->state.pass->allow_ib2_skipping) {
+      /* Disable CP_INDIRECT_BUFFER/CP_DRAW skipping again at the end of the
+       * pass -- tile_store_cs is for stores that can't be skipped based on
+       * visibility.
+       */
+      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
+      tu_cs_emit(cs, 0x0);
+   }
 
    tu_cs_emit_call(cs, &cmd->tile_store_cs);
 
@@ -2921,7 +2983,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
     * called from tu6_render_tile().
     */
    tu_cs_begin(&cmd->tile_store_cs);
-   tu6_emit_tile_store(cmd, &cmd->tile_store_cs);
+   tu6_emit_tile_store_cs(cmd, &cmd->tile_store_cs);
    tu_cs_end(&cmd->tile_store_cs);
 
    cmd->trace_rp_drawcalls_end = u_trace_end_iterator(&cmd->trace);
@@ -5866,20 +5928,22 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 
+   struct tu_resolve_group resolve_group = {};
+
    if (subpass->resolve_attachments) {
       tu6_emit_blit_scissor(cmd, cs, true, false);
 
-      struct tu_resolve_group resolve_group = {};
-
       /* TODO: we're emitting the resolves into the draw CS, which is conditionally
       * executed based on geometry being present. That's not actually correct
       * unless the resolve is generating geometry into the vis stream.
       */
      tu6_emit_gmem_resolves(cmd, subpass, &resolve_group, cs);
-
-      tu_emit_resolve_group(cmd, cs, &resolve_group);
   }
 
+   tu6_emit_gmem_stores(cmd, &cmd->draw_cs, &resolve_group, subpass);
+
+   tu_emit_resolve_group(cmd, cs, &resolve_group);
+
   tu_cond_exec_end(cs);
 
   if (cmd->state.pass->has_fdm)
diff --git a/src/freedreno/vulkan/tu_pass.cc b/src/freedreno/vulkan/tu_pass.cc
index 18012c40970..d8caa651e99 100644
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@@ -581,6 +581,24 @@ tu_render_pass_cond_config(struct tu_device *device,
    }
 }
 
+/**
+ * Checks if the pass should allow IB2 skipping.
+ *
+ * If any stores would be emitted in a non-final subpass, then we need to turn
+ * off IB2 skipping to make sure that we don't early-return before they happen.
+ */
+static void
+tu_render_pass_check_ib2_skip(struct tu_render_pass *pass)
+{
+   pass->allow_ib2_skipping = true;
+   for (int i = 0; i < pass->attachment_count; i++) {
+      struct tu_render_pass_attachment *att = &pass->attachments[i];
+      if ((att->store || att->store_stencil) &&
+          att->last_subpass_idx != pass->subpass_count - 1)
+         pass->allow_ib2_skipping = false;
+   }
+}
+
 static void
 tu_render_pass_gmem_config(struct tu_render_pass *pass,
                            const struct tu_physical_device *phys_dev)
@@ -803,11 +821,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
    /* Loads and clears are emitted at the start of the subpass that needs them.
     */
    att->first_subpass_idx = MIN2(i, att->first_subpass_idx);
-   /* Stores are emitted at vkEndRenderPass() time. */
-   if (att->store || att->store_stencil)
-      att->last_subpass_idx = pass->subpass_count - 1;
-   else
-      att->last_subpass_idx = MAX2(i, att->last_subpass_idx);
+   /* Stores are emitted after the last subpass using them. */
+   att->last_subpass_idx = MAX2(i, att->last_subpass_idx);
 }
 
 static void
@@ -1044,6 +1059,7 @@ tu_CreateRenderPass2(VkDevice _device,
       }
    }
 
+   tu_render_pass_check_ib2_skip(pass);
    tu_render_pass_cond_config(device, pass);
    tu_render_pass_gmem_config(pass, device->physical_device);
    tu_render_pass_bandwidth_config(pass);
@@ -1289,6 +1305,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
 
    pass->attachment_count = a;
 
+   tu_render_pass_check_ib2_skip(pass);
    tu_render_pass_cond_config(device, pass);
    tu_render_pass_gmem_config(pass, device->physical_device);
    tu_render_pass_bandwidth_config(pass);
diff --git a/src/freedreno/vulkan/tu_pass.h b/src/freedreno/vulkan/tu_pass.h
index c3e4d1169c1..fbe36d838cb 100644
--- a/src/freedreno/vulkan/tu_pass.h
+++ b/src/freedreno/vulkan/tu_pass.h
@@ -138,6 +138,7 @@ struct tu_render_pass
    struct tu_render_pass_attachment *attachments;
    bool has_cond_load_store;
    bool has_fdm;
+   bool allow_ib2_skipping;
 
    struct tu_subpass_barrier end_barrier;
    struct tu_subpass subpasses[0];