diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index b88acace021..5086344942f 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -133,6 +133,10 @@ struct fd_dev_info { bool has_dp2acc; bool has_dp4acc; + bool enable_lrz_fast_clear; + bool has_lrz_dir_tracking; + bool lrz_track_quirk; + struct { uint32_t RB_UNKNOWN_8E04_blit; uint32_t PC_POWER_CNTL; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index a3b64f69510..111789c4836 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -138,6 +138,10 @@ class A6xxGPUInfo(GPUInfo): self.a6xx.has_cp_reg_write = True self.a6xx.has_8bpp_ubwc = True + # All a6xx gens support lrz fast-clear, however newer blob driver + # (v615) doesn't use it for gen1 and gen2. + self.a6xx.enable_lrz_fast_clear = True + for name, val in template.items(): if name == "magic": # handled above continue @@ -245,6 +249,8 @@ a6xx_gen3 = dict( has_ccu_flush_bug = True, has_8bpp_ubwc = False, has_dp2acc = True, + has_lrz_dir_tracking = True, + lrz_track_quirk = True, magic = dict( # this seems to be a chicken bit that fixes cubic filtering: TPL1_DBG_ECO_CNTL = 0x1000000, @@ -271,6 +277,7 @@ a6xx_gen4 = dict( has_getfiberid = True, has_dp2acc = True, has_dp4acc = True, + has_lrz_dir_tracking = True, magic = dict( TPL1_DBG_ECO_CNTL = 0x5008000, ), diff --git a/src/freedreno/fdl/fd6_view.c b/src/freedreno/fdl/fd6_view.c index 003939954e0..95ae463b06d 100644 --- a/src/freedreno/fdl/fd6_view.c +++ b/src/freedreno/fdl/fd6_view.c @@ -316,6 +316,15 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts, A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_PITCH(ubwc_pitch) | A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_ARRAY_PITCH(layout->ubwc_layer_size >> 2); + const struct util_format_description *format_desc = + util_format_description(args->format); + if (util_format_has_depth(format_desc)) { + view->GRAS_LRZ_DEPTH_VIEW = + A6XX_GRAS_LRZ_DEPTH_VIEW_BASE_LAYER(args->base_array_layer) | + A6XX_GRAS_LRZ_DEPTH_VIEW_LAYER_COUNT(args->layer_count) | + A6XX_GRAS_LRZ_DEPTH_VIEW_BASE_MIP_LEVEL(args->base_miplevel); + } + view->base_addr = base_addr; view->ubwc_addr = ubwc_addr; view->layer_size = layer_size; diff --git a/src/freedreno/fdl/freedreno_layout.h b/src/freedreno/fdl/freedreno_layout.h index 2b7d1f6f916..b743b0eb690 100644 --- a/src/freedreno/fdl/freedreno_layout.h +++ b/src/freedreno/fdl/freedreno_layout.h @@ -313,6 +313,8 @@ struct fdl6_view { uint32_t RB_2D_DST_INFO; uint32_t RB_BLIT_DST_INFO; + + uint32_t GRAS_LRZ_DEPTH_VIEW; }; void diff --git a/src/freedreno/registers/adreno/a6xx.xml b/src/freedreno/registers/adreno/a6xx.xml index 8771fad1855..e5f77d3afd9 100644 --- a/src/freedreno/registers/adreno/a6xx.xml +++ b/src/freedreno/registers/adreno/a6xx.xml @@ -1785,6 +1785,11 @@ to upconvert to 32b float internally? update MAX instead of MIN value, ie. GL_GREATER/GL_GEQUAL + + Clears the LRZ block being touched to: + - 0.0 if GREATER + - 1.0 if LESS + @@ -1857,6 +1862,14 @@ to upconvert to 32b float internally? 
+ diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index e4e0b3afe3e..fdcfafa12e2 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -41,6 +41,7 @@ libtu_files = files( 'tu_descriptor_set.h', 'tu_formats.c', 'tu_image.c', + 'tu_lrz.c', 'tu_nir_lower_multiview.c', 'tu_pass.c', 'tu_pipeline.c', diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index 69502e310a6..3e8120c25e2 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -1304,6 +1304,15 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, { const struct blit_ops *ops = &r2d_ops; + /* It is assumed that LRZ cache is invalidated at this point for + * the writes here to become visible to LRZ. + * + * LRZ writes are going through UCHE cache, flush UCHE before changing + * LRZ via CCU. Don't need to invalidate CCU since we are presumably + * writing whole cache lines we assume to be 64 bytes. + */ + tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS); + ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, VK_SAMPLE_COUNT_1_BIT); ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value); @@ -1313,6 +1322,32 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height}); ops->run(cmd, cs); ops->teardown(cmd, cs); + + /* Clearing writes via CCU color in the PS stage, and LRZ is read via + * UCHE in the earlier GRAS stage. + */ + cmd->state.cache.flush_bits |= + TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | + TU_CMD_FLAG_WAIT_FOR_IDLE; +} + +void +tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image *image) +{ + const struct blit_ops *ops = &r2d_ops; + VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } }; + + /* LRZ fast-clear buffer is always allocated with 512 bytes size. 
*/ + ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, + VK_SAMPLE_COUNT_1_BIT); + ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear); + ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, + image->iova + image->lrz_fc_offset, 512); + ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1}); + ops->run(cmd, cs); + ops->teardown(cmd, cs); } static void @@ -1536,6 +1571,10 @@ tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i, pBlitImageInfo->filter); } + + if (dst_image->lrz_height) { + tu_disable_lrz(cmd, &cmd->cs, dst_image); + } } static void @@ -1640,6 +1679,10 @@ tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i) tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pCopyBufferToImageInfo->pRegions + i); + + if (dst_image->lrz_height) { + tu_disable_lrz(cmd, &cmd->cs, dst_image); + } } static void @@ -1954,6 +1997,10 @@ tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, tu_copy_image_to_image(cmd, src_image, dst_image, pCopyImageInfo->pRegions + i); } + + if (dst_image->lrz_height) { + tu_disable_lrz(cmd, &cmd->cs, dst_image); + } } static void @@ -2284,6 +2331,8 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); } + + tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges); } static void @@ -2643,8 +2692,8 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, for (uint32_t j = 0; j < attachmentCount; j++) { if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0) continue; - cmd->state.lrz.valid = false; - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + + tu_lrz_disable_during_renderpass(cmd); } /* vkCmdClearAttachments is supposed to respect the predicate if active. 
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index ab949110566..4b70b87e7cd 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -218,11 +218,6 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); - tu_cs_emit_regs(cs, - A6XX_GRAS_LRZ_BUFFER_BASE(0), - A6XX_GRAS_LRZ_BUFFER_PITCH(0), - A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0)); - tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0)); return; @@ -247,10 +242,6 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); tu_cs_image_flag_ref(cs, &iview->view, 0); - tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.qword = iview->image->iova + iview->image->lrz_offset), - A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch), - A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE()); - if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT || attachment->format == VK_FORMAT_S8_UINT) { @@ -1243,6 +1234,7 @@ tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *sub tu_emit_input_attachments(cmd, subpass, false)); } + static void tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd, const VkRenderPassBeginInfo *info) @@ -1274,14 +1266,15 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, { const struct tu_framebuffer *fb = cmd->state.framebuffer; + tu_lrz_sysmem_begin(cmd, cs); + assert(fb->width > 0 && fb->height > 0); tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); tu6_emit_window_offset(cs, 0, 0); tu6_emit_bin_size(cs, 0, 0, - A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM)); - - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) | + A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS); tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); @@ -1318,7 +1311,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + tu_lrz_sysmem_end(cmd, cs); tu_cs_sanity_check(cs); } @@ -1329,7 +1322,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, { struct tu_physical_device *phys_dev = cmd->device->physical_device; - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + tu_lrz_tiling_begin(cmd, cs); tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x0); @@ -1424,10 +1417,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); - tu_cs_emit_regs(cs, - A6XX_GRAS_LRZ_CNTL(0)); - - tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + tu_lrz_tiling_end(cmd, cs); tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS); @@ -1770,9 +1760,13 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, } if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + TU_FROM_HANDLE(tu_framebuffer, fb, pBeginInfo->pInheritanceInfo->framebuffer); + cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + + tu_lrz_begin_secondary_cmdbuf(cmd_buffer, fb); } else { /* When executing in the middle of another command buffer, the CCU * state is unknown. 
@@ -3359,6 +3353,11 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, cmd->state.draw_cs_writes_to_cond_pred |= secondary->state.draw_cs_writes_to_cond_pred; + /* If LRZ was made invalid in secondary - we should disable + * LRZ retroactively for the whole renderpass. + */ + if (!secondary->state.lrz.valid) + cmd->state.lrz.valid = false; } else { assert(tu_cs_is_empty(&secondary->draw_cs)); assert(tu_cs_is_empty(&secondary->draw_epilogue_cs)); @@ -3370,7 +3369,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, } cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */ - if (cmd->state.pass) { + if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) { /* After a secondary command buffer is executed, LRZ is not valid * until it is cleared again. */ @@ -3577,31 +3576,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, if (pass->subpasses[0].feedback_invalidate) cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE; - /* Track LRZ valid state */ - uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; - if (a != VK_ATTACHMENT_UNUSED) { - const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; - struct tu_image *image = cmd->state.attachments[a]->image; - /* if image has lrz and it isn't a stencil-only clear: */ - if (image->lrz_height && - (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) { - cmd->state.lrz.image = image; - cmd->state.lrz.valid = true; - cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN; - - tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]); - - /* Clearing writes via CCU color in the PS stage, and LRZ is read via - * UCHE in the earlier GRAS stage. - */ - cmd->state.cache.flush_bits |= - TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | - TU_CMD_FLAG_WAIT_FOR_IDLE; - } else { - cmd->state.lrz.valid = false; - } - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; - } + tu_lrz_begin_renderpass(cmd, pRenderPassBegin); cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); @@ -3755,253 +3730,6 @@ tu6_emit_consts_geom(struct tu_cmd_buffer *cmd, return tu_cs_end_draw_state(&cmd->sub_cs, &cs); } -/* update lrz state based on stencil-test func: - * - * Conceptually the order of the pipeline is: - * - * - * FS -> Alpha-Test -> Stencil-Test -> Depth-Test - * | | - * if wrmask != 0 if wrmask != 0 - * | | - * v v - * Stencil-Write Depth-Write - * - * Because Stencil-Test can have side effects (Stencil-Write) prior - * to depth test, in this case we potentially need to disable early - * lrz-test. See: - * - * https://www.khronos.org/opengl/wiki/Per-Sample_Processing - */ -static void -tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl, - VkCompareOp func, - bool stencil_write, - bool *invalidate_lrz) -{ - switch (func) { - case VK_COMPARE_OP_ALWAYS: - /* nothing to do for LRZ, but for stencil test when stencil- - * write is enabled, we need to disable lrz-test, since - * conceptually stencil test and write happens before depth-test. - */ - if (stencil_write) { - gras_lrz_cntl->enable = false; - gras_lrz_cntl->z_test_enable = false; - *invalidate_lrz = true; - } - break; - case VK_COMPARE_OP_NEVER: - /* fragment never passes, disable lrz_write for this draw. */ - gras_lrz_cntl->lrz_write = false; - break; - default: - /* whether the fragment passes or not depends on result - * of stencil test, which we cannot know when doing binning - * pass. 
- */ - gras_lrz_cntl->lrz_write = false; - /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side- - * effects from stencil test we need to disable lrz-test. - */ - if (stencil_write) { - gras_lrz_cntl->enable = false; - gras_lrz_cntl->z_test_enable = false; - *invalidate_lrz = true; - } - break; - } -} - -static struct A6XX_GRAS_LRZ_CNTL -tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd, - const uint32_t a) -{ - struct tu_pipeline *pipeline = cmd->state.pipeline; - bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; - bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; - VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT; - - struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 }; - - /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth - * or early fragment tests. - */ - if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) { - cmd->state.lrz.valid = false; - return gras_lrz_cntl; - } - - /* If depth test is disabled we shouldn't touch LRZ. - * Same if there is no depth attachment. - */ - if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || - (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ)) - return gras_lrz_cntl; - - if (!cmd->state.attachments) { - /* Secondary cmdbuf - there is nothing we could do. */ - return gras_lrz_cntl; - } - - gras_lrz_cntl.enable = z_test_enable; - gras_lrz_cntl.lrz_write = - z_write_enable && - !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE); - gras_lrz_cntl.z_test_enable = z_read_enable; - gras_lrz_cntl.z_bounds_enable = z_bounds_enable; - - /* See comment in tu_pipeline about disabling LRZ write for blending. */ - if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) && - cmd->state.logic_op_enabled && cmd->state.rop_reads_dst) - gras_lrz_cntl.lrz_write = false; - - if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) && - cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts)) - gras_lrz_cntl.lrz_write = false; - - /* LRZ is disabled until it is cleared, which means that one "wrong" - * depth test or shader could disable LRZ until depth buffer is cleared. - */ - bool disable_lrz = false; - bool temporary_disable_lrz = false; - - /* If Z is not written - it doesn't affect LRZ buffer state. - * Which means two things: - * - Don't lock direction until Z is written for the first time; - * - If Z isn't written and direction IS locked it's possible to just - * temporary disable LRZ instead of fully bailing out, when direction - * is changed. - */ - - enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN; - switch (depth_compare_op) { - case VK_COMPARE_OP_ALWAYS: - case VK_COMPARE_OP_NOT_EQUAL: - /* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction, - * so if there is a depth write - LRZ must be disabled. - */ - if (z_write_enable) { - perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL"); - disable_lrz = true; - } else { - perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL"); - temporary_disable_lrz = true; - } - break; - case VK_COMPARE_OP_EQUAL: - case VK_COMPARE_OP_NEVER: - /* Blob disables LRZ for OP_EQUAL, and from our empirical - * evidence it is a right thing to do. 
- * - * Both OP_EQUAL and OP_NEVER don't change LRZ buffer so - * we could just temporary disable LRZ. - */ - temporary_disable_lrz = true; - break; - case VK_COMPARE_OP_GREATER: - case VK_COMPARE_OP_GREATER_OR_EQUAL: - lrz_direction = TU_LRZ_GREATER; - gras_lrz_cntl.greater = true; - break; - case VK_COMPARE_OP_LESS: - case VK_COMPARE_OP_LESS_OR_EQUAL: - lrz_direction = TU_LRZ_LESS; - gras_lrz_cntl.greater = false; - break; - default: - unreachable("bad VK_COMPARE_OP value or uninitialized"); - break; - }; - - /* If depthfunc direction is changed, bail out on using LRZ. The - * LRZ buffer encodes a min/max depth value per block, but if - * we switch from GT/GE <-> LT/LE, those values cannot be - * interpreted properly. - */ - if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN && - lrz_direction != TU_LRZ_UNKNOWN && - cmd->state.lrz.prev_direction != lrz_direction) { - if (z_write_enable) { - perf_debug(cmd->device, "Invalidating LRZ due to direction change"); - disable_lrz = true; - } else { - perf_debug(cmd->device, "Skipping LRZ due to direction change"); - temporary_disable_lrz = true; - } - } - - /* Consider the following sequence of depthfunc changes: - * - * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER - * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled - * during second VK_COMPARE_OP_GREATER. - * - * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS - * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become - * invalid during COMPARE_OP_LESS. - * - * This shows that we should keep last KNOWN direction. - */ - if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN) - cmd->state.lrz.prev_direction = lrz_direction; - - /* Invalidate LRZ and disable write if stencil test is enabled */ - bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE; - if (stencil_test_enable) { - bool stencil_front_writemask = - (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? - (cmd->state.dynamic_stencil_wrmask & 0xff) : - (pipeline->stencil_wrmask & 0xff); - - bool stencil_back_writemask = - (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? 
- ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) : - (pipeline->stencil_wrmask & 0xff00) >> 8; - - VkCompareOp stencil_front_compare_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT; - - VkCompareOp stencil_back_compare_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT; - - tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op, - stencil_front_writemask, &disable_lrz); - - tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op, - stencil_back_writemask, &disable_lrz); - } - - if (disable_lrz) - cmd->state.lrz.valid = false; - - if (temporary_disable_lrz) - gras_lrz_cntl.enable = false; - - cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable; - if (!cmd->state.lrz.enabled) - memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl)); - - return gras_lrz_cntl; -} - -static void -tu6_build_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; - struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a); - - tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL( - .enable = gras_lrz_cntl.enable, - .greater = gras_lrz_cntl.greater, - .lrz_write = gras_lrz_cntl.lrz_write, - .z_test_enable = gras_lrz_cntl.z_test_enable, - .z_bounds_enable = gras_lrz_cntl.z_bounds_enable)); - tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable)); -} - static bool tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable) { @@ -4186,7 +3914,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, struct tu_cs cs; cmd->state.lrz_and_depth_plane_state = tu_cs_draw_state(&cmd->sub_cs, &cs, 8); - tu6_build_lrz(cmd, &cs); + tu6_emit_lrz(cmd, &cs); tu6_build_depth_plane_z_mode(cmd, &cs); } diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 4264aaad87f..7e5e8505754 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -360,6 +360,7 @@ static const struct debug_control tu_debug_options[] = { { "noubwc", TU_DEBUG_NOUBWC }, { "nomultipos", TU_DEBUG_NOMULTIPOS }, { "nolrz", TU_DEBUG_NOLRZ }, + { "nolrzfc", TU_DEBUG_NOLRZFC }, { "perf", TU_DEBUG_PERF }, { "perfc", TU_DEBUG_PERFC }, { "flushall", TU_DEBUG_FLUSHALL }, diff --git a/src/freedreno/vulkan/tu_image.c b/src/freedreno/vulkan/tu_image.c index 8ab5f1481a5..5dcc5c1bd71 100644 --- a/src/freedreno/vulkan/tu_image.c +++ b/src/freedreno/vulkan/tu_image.c @@ -568,6 +568,35 @@ tu_image_init(struct tu_device *device, struct tu_image *image, image->lrz_offset = image->total_size; unsigned lrz_size = lrz_pitch * lrz_height * 2; image->total_size += lrz_size; + + unsigned nblocksx = DIV_ROUND_UP(DIV_ROUND_UP(width, 8), 16); + unsigned nblocksy = DIV_ROUND_UP(DIV_ROUND_UP(height, 8), 4); + + /* Fast-clear buffer is 1bit/block */ + image->lrz_fc_size = DIV_ROUND_UP(nblocksx * nblocksy, 8); + + /* Fast-clear buffer cannot be larger than 512 bytes (HW limitation) */ + bool has_lrz_fc = image->lrz_fc_size <= 512 && + device->physical_device->info->a6xx.enable_lrz_fast_clear && + !unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOLRZFC); + + if (has_lrz_fc || device->physical_device->info->a6xx.has_lrz_dir_tracking) { + image->lrz_fc_offset = image->total_size; + image->total_size += 512; + + if (device->physical_device->info->a6xx.has_lrz_dir_tracking) { + /* Direction tracking uses 1 byte */ + image->total_size += 1; + /* GRAS_LRZ_DEPTH_VIEW needs 5 bytes: 
4 for view data and 1 for padding */ + image->total_size += 5; + } + } + + if (!has_lrz_fc) { + image->lrz_fc_size = 0; + } + } else { + image->lrz_height = 0; } return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_lrz.c b/src/freedreno/vulkan/tu_lrz.c new file mode 100644 index 00000000000..9f372064bac --- /dev/null +++ b/src/freedreno/vulkan/tu_lrz.c @@ -0,0 +1,796 @@ +/* + * Copyright © 2022 Igalia S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "tu_private.h" + +#include "tu_cs.h" + +/* The low-resolution Z buffer is very similar to a depth prepass that helps + * the HW avoid executing the fragment shader on those fragments that will + * subsequently be discarded by the depth test. + * + * The interesting part of this feature is that it allows applications + * to submit the vertices in any order. + * + * In the binning pass it is possible to store the depth value of each + * vertex into an internal low-resolution depth buffer and quickly test + * the primitives against it during the render pass. + * + * There are a number of cases when LRZ cannot be used: + * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc); + * - Writing to the stencil buffer; + * - Writing depth while: + * - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS); + * - Using OP_ALWAYS or OP_NOT_EQUAL; + * - Clearing depth with vkCmdClearAttachments; + * - (pre-a650) Not clearing depth attachment with LOAD_OP_CLEAR; + * - (pre-a650) Using secondary command buffers; + * - Sysmem rendering (with a small caveat). + * + * Pre-a650 (before gen3) + * ====================== + * + * The direction is fully tracked on the CPU. In a renderpass LRZ starts with + * an unknown direction; the direction is set the first time a depth write + * occurs, and if it changes afterwards the direction becomes invalid and LRZ + * is disabled for the rest of the renderpass. + * + * Since the direction is not tracked by the GPU, it's impossible to know + * whether LRZ is enabled during construction of secondary command buffers. + * + * For the same reason it's impossible to reuse LRZ between renderpasses. + * + * A650+ (gen3+) + * ============= + * + * Now the LRZ direction can be tracked on the GPU. There are two parts: + * - A direction byte which stores the current LRZ direction; + * - Parameters of the last used depth view.
+ * + * The idea is the same as when LRZ is tracked on the CPU: when GRAS_LRZ_CNTL + * is used, its direction is compared to the previously known direction, + * and the direction byte is set to disabled when the directions are incompatible. + * + * Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks + * whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value + * stored in the buffer; if not, LRZ is disabled. (This is necessary + * because the depth buffer may have several layers and mip levels, while + * the LRZ buffer represents only a single layer + mip level.) + * + * The LRZ direction between renderpasses is disabled when the underlying depth + * buffer is changed; the following commands can change the depth image: + * - vkCmdBlitImage* + * - vkCmdCopyBufferToImage* + * - vkCmdCopyImage* + * + * LRZ Fast-Clear + * ============== + * + * The LRZ fast-clear buffer is initialized to zeroes and read/written + * when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store 1b/block. + * '0' means the block has the original depth clear value, and '1' means that the + * corresponding block in LRZ has been modified. + * + * LRZ fast-clear conservatively clears the LRZ buffer; at the point where LRZ is + * written, the LRZ block which corresponds to a single fast-clear bit is cleared: + * - To 0.0 if the depth comparison is GREATER; + * - To 1.0 if the depth comparison is LESS; + * + * This way it's always valid to fast-clear. On the other hand we disable + * fast-clear if the depth clear value is not 0.0 or 1.0 because it may be worse + * for perf if some primitives are expected to fail the depth test against the + * actual depth clear value. + * + * LRZ Precision + * ============= + * + * LRZ always uses Z16_UNORM. The epsilon for it is 1.f / (1 << 16), which is + * not enough to represent all values of Z32_UNORM or Z32_FLOAT. + * This especially raises questions in the context of fast-clear: if fast-clear + * uses a value which cannot be precisely represented by LRZ, we wouldn't + * be able to round it in the correct direction since the direction is tracked + * on the GPU. + * + * However, it seems that depth comparisons with LRZ values have some "slack" + * and nothing special should be done for such depth clear values. + * + * How it was tested: + * - Clear Z32_FLOAT attachment to 1.f / (1 << 17) + * - LRZ buffer contains all zeroes + * - Do draws and check whether all samples are passing: + * - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing; + * - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing; + * - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - samples; + * - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing; + * - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing; + * In all cases the resulting LRZ buffer is all zeroes and the LRZ direction is updated. + * + * LRZ Caches + * ========== + * + * ! The policy here is to flush the LRZ cache right after it is changed, + * so if LRZ data is needed afterwards there is no need to flush it + * before using LRZ. + * + * LRZ_FLUSH flushes and invalidates the LRZ caches, of which there are two: + * - Cache for the fast-clear buffer; + * - Cache for the direction byte + depth view params. + * They can be cleared by LRZ_CLEAR. To become visible in GPU memory + * the caches should be flushed with LRZ_FLUSH afterwards. + * + * GRAS_LRZ_CNTL reads from these caches.
+ */ + +static void +tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image) +{ + if (!depth_image) { + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_BUFFER_BASE(0), + A6XX_GRAS_LRZ_BUFFER_PITCH(0), + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0)); + return; + } + + uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset; + uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset; + if (!depth_image->lrz_fc_offset) + lrz_fc_iova = 0; + + tu_cs_emit_regs(cs, + A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova), + A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch), + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova)); +} + +static void +tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + struct tu_reg_value reg) +{ + if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) { + tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); + tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ)); + tu_cs_emit(cs, reg.reg); + tu_cs_emit(cs, reg.value); + } else { + tu_cs_emit_pkt4(cs, reg.reg, 1); + tu_cs_emit(cs, reg.value); + } +} + +static void +tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + /* Disable direction by writing invalid depth view. */ + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW( + .base_layer = 0b11111111111, + .layer_count = 0b11111111111, + .base_mip_level = 0b1111, + )); + + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .disable_on_wrong_dir = true, + )); + + tu6_emit_event_write(cmd, cs, LRZ_CLEAR); + tu6_emit_event_write(cmd, cs, LRZ_FLUSH); +} + +static void +tu_lrz_init_state(struct tu_cmd_buffer *cmd, + const struct tu_render_pass_attachment *att, + const struct tu_image_view *view) +{ + if (!view->image->lrz_height) + return; + + bool clears_depth = att->clear_mask & + (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT); + bool has_gpu_tracking = + cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking; + + if (!has_gpu_tracking && !clears_depth) + return; + + if (!clears_depth && !att->load) + return; + + cmd->state.lrz.image_view = view; + cmd->state.lrz.valid = true; + cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN; + /* Be optimistic and unconditionally enable fast-clear in + * secondary cmdbufs and when reusing previous LRZ state. + */ + cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0; + + cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking; + cmd->state.lrz.reuse_previous_state = !clears_depth; +} + +void +tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd, + const VkRenderPassBeginInfo *pRenderPassBegin) +{ + const struct tu_render_pass *pass = cmd->state.pass; + + int lrz_img_count = 0; + for (unsigned i = 0; i < pass->attachment_count; i++) { + if (cmd->state.attachments[i]->image->lrz_height) + lrz_img_count++; + } + + if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking && + cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) { + /* Theoretically we could switch between LRZ buffers during the binning + * and tiling passes, but it is untested and would add complexity for + * presumably extremely rare case. 
+ */ + perf_debug(cmd->device, + "Invalidating LRZ because there are several subpasses with " + "different depth attachments in a single renderpass"); + + for (unsigned i = 0; i < pass->attachment_count; i++) { + struct tu_image *image = cmd->state.attachments[i]->image; + tu_disable_lrz(cmd, &cmd->cs, image); + } + } + + /* Track LRZ valid state */ + cmd->state.lrz.valid = false; + uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; + if (a != VK_ATTACHMENT_UNUSED) { + const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; + tu_lrz_init_state(cmd, att, cmd->state.attachments[a]); + if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) { + VkClearValue clear = pRenderPassBegin->pClearValues[a]; + cmd->state.lrz.depth_clear_value = clear; + cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear && + (clear.depthStencil.depth == 0.f || + clear.depthStencil.depth == 1.f); + } + cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + } + + if (!cmd->state.lrz.valid) { + memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz)); + tu6_emit_lrz_buffer(&cmd->cs, NULL); + } +} + +void +tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd, + struct tu_framebuffer *fb) +{ + uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; + if (a != VK_ATTACHMENT_UNUSED && + cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) { + const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; + struct tu_image_view *view = fb->attachments[a].attachment; + + tu_lrz_init_state(cmd, att, view); + } +} + +void +tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (!cmd->state.lrz.image_view) + return; + + struct tu_lrz_state *lrz = &cmd->state.lrz; + + tu6_emit_lrz_buffer(cs, lrz->image_view->image); + + if (lrz->reuse_previous_state) { + /* Reuse the previous LRZ state; the LRZ cache is assumed to be + * already invalidated by the previous renderpass. + */ + assert(lrz->gpu_dir_tracking); + + tu6_write_lrz_reg(cmd, cs, + A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW)); + return; + } + + bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking; + if (invalidate_lrz) { + /* Following the blob we elect to disable LRZ for the whole renderpass + * if it is known that LRZ is disabled somewhere in the renderpass. + * + * This is accomplished by making the later GRAS_LRZ_CNTL (in the binning + * pass) fail the comparison of depth views. + */ + tu6_disable_lrz_via_depth_view(cmd, cs); + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0)); + } else if (lrz->fast_clear || lrz->gpu_dir_tracking) { + if (lrz->gpu_dir_tracking) { + tu6_write_lrz_reg(cmd, cs, + A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW)); + } + + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .fc_enable = lrz->fast_clear, + .disable_on_wrong_dir = lrz->gpu_dir_tracking, + )); + + /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer; + * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to + * CUR_DIR_UNSET. + */ + tu6_emit_event_write(cmd, cs, LRZ_CLEAR); + } + + if (!lrz->fast_clear && !invalidate_lrz) { + tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); + + /* Even though we disable fast-clear we still have to dirty the + * fast-clear buffer because both secondary cmdbufs and following + * renderpasses won't know that fast-clear is disabled.
+ * + * TODO: we could avoid this if we don't store depth and don't + * expect secondary cmdbufs. + */ + if (lrz->image_view->image->lrz_fc_size) { + tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image); + } + } +} + +void +tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) { + tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image); + + if (cmd->state.lrz.gpu_dir_tracking) { + tu6_write_lrz_reg(cmd, &cmd->cs, + A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW)); + } + + /* Enable flushing of LRZ fast-clear and of direction buffer */ + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .fc_enable = cmd->state.lrz.fast_clear, + .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking, + )); + } else { + tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0)); + } + + tu6_emit_event_write(cmd, cs, LRZ_FLUSH); + + /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this + * point additionally clears the direction buffer: + * GRAS_LRZ_DEPTH_VIEW(.dword = 0) + * GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff) + * A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true) + * LRZ_CLEAR + * LRZ_FLUSH + * Since it happens after all of the rendering is done, there is no known + * reason to do such a clear. + */ +} + +void +tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (!cmd->state.lrz.image_view) + return; + + /* Actually, the LRZ buffer could be filled in sysmem, in theory to + * be used in another renderpass, but the benefit is rather dubious. + */ + + struct tu_lrz_state *lrz = &cmd->state.lrz; + + if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) { + tu_disable_lrz(cmd, cs, lrz->image_view->image); + /* Make sure depth view comparison will fail. */ + tu6_write_lrz_reg(cmd, cs, + A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0)); + } else { + tu6_emit_lrz_buffer(cs, lrz->image_view->image); + /* Even though we disable LRZ writes in sysmem mode, there is still + * an LRZ test, so LRZ should be cleared. + */ + if (lrz->fast_clear) { + tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .fc_enable = true, + )); + tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR); + tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); + } else { + tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value); + } + } +} + +void +tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); +} + +/* Disable LRZ outside of a renderpass. */ +void +tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + struct tu_image *image) +{ + if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) + return; + + if (!image->lrz_height) + return; + + tu6_emit_lrz_buffer(cs, image); + tu6_disable_lrz_via_depth_view(cmd, cs); +} + +/* Clear LRZ, used for out-of-renderpass depth clears. */ +void +tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd, + struct tu_image *image, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + if (!rangeCount || !image->lrz_height || + !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) + return; + + /* We cannot predict which depth subresource will be used later on, + * so we just pick the first one with depth cleared and clear the LRZ.
+ */ + const VkImageSubresourceRange *range = NULL; + for (unsigned i = 0; i < rangeCount; i++) { + if (pRanges[i].aspectMask & + (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) { + range = &pRanges[i]; + break; + } + } + + if (!range) + return; + + bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f || + pDepthStencil->depth == 1.f); + + tu6_emit_lrz_buffer(&cmd->cs, image); + + tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW( + .base_layer = range->baseArrayLayer, + .layer_count = tu_get_layerCount(image, range), + .base_mip_level = range->baseMipLevel, + )); + + tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .fc_enable = fast_clear, + .disable_on_wrong_dir = true, + )); + + tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR); + tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH); + + if (!fast_clear) { + tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil); + } +} + +void +tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd) +{ + assert(cmd->state.pass); + + cmd->state.lrz.valid = false; + cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + + if (cmd->state.lrz.gpu_dir_tracking) { + tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL( + .enable = true, + .dir = LRZ_DIR_INVALID, + .disable_on_wrong_dir = true, + )); + } +} + +/* update lrz state based on stencil-test func: + * + * Conceptually the order of the pipeline is: + * + * + * FS -> Alpha-Test -> Stencil-Test -> Depth-Test + * | | + * if wrmask != 0 if wrmask != 0 + * | | + * v v + * Stencil-Write Depth-Write + * + * Because Stencil-Test can have side effects (Stencil-Write) prior + * to depth test, in this case we potentially need to disable early + * lrz-test. See: + * + * https://www.khronos.org/opengl/wiki/Per-Sample_Processing + */ +static bool +tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl, + VkCompareOp func, + bool stencil_write) +{ + switch (func) { + case VK_COMPARE_OP_ALWAYS: + /* nothing to do for LRZ, but for stencil test when stencil- + * write is enabled, we need to disable lrz-test, since + * conceptually stencil test and write happens before depth-test. + */ + if (stencil_write) { + return false; + } + break; + case VK_COMPARE_OP_NEVER: + /* fragment never passes, disable lrz_write for this draw. */ + gras_lrz_cntl->lrz_write = false; + break; + default: + /* whether the fragment passes or not depends on result + * of stencil test, which we cannot know when doing binning + * pass. + */ + gras_lrz_cntl->lrz_write = false; + /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side- + * effects from stencil test we need to disable lrz-test. + */ + if (stencil_write) { + return false; + } + break; + } + + return true; +} + +static struct A6XX_GRAS_LRZ_CNTL +tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd, + const uint32_t a) +{ + struct tu_pipeline *pipeline = cmd->state.pipeline; + bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; + bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; + bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; + VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT; + + struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 }; + + if (!cmd->state.lrz.valid) { + return gras_lrz_cntl; + } + + /* If depth test is disabled we shouldn't touch LRZ. 
+ * Same if there is no depth attachment. + */ + if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || + (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ)) + return gras_lrz_cntl; + + if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) { + /* Without on-GPU LRZ direction tracking there is nothing we + * can do to enable LRZ in secondary command buffers. + */ + return gras_lrz_cntl; + } + + gras_lrz_cntl.enable = true; + gras_lrz_cntl.lrz_write = + z_write_enable && + !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE); + gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable; + gras_lrz_cntl.z_bounds_enable = z_bounds_enable; + gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear; + gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking; + gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking; + + /* See comment in tu_pipeline about disabling LRZ write for blending. */ + if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) && + cmd->state.logic_op_enabled && cmd->state.rop_reads_dst) + gras_lrz_cntl.lrz_write = false; + + if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) && + cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts)) + gras_lrz_cntl.lrz_write = false; + + /* LRZ is disabled until it is cleared, which means that one "wrong" + * depth test or shader could disable LRZ until the depth buffer is cleared. + */ + bool disable_lrz = false; + bool temporary_disable_lrz = false; + + /* What happens in the FS could affect LRZ, e.g.: writes to gl_FragDepth + * or early fragment tests. + */ + if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) { + perf_debug(cmd->device, "Invalidating LRZ due to FS"); + disable_lrz = true; + } + + /* If Z is not written, it doesn't affect the LRZ buffer state. + * Which means two things: + * - Don't lock the direction until Z is written for the first time; + * - If Z isn't written and the direction IS locked it's possible to just + * temporarily disable LRZ instead of fully bailing out, when the direction + * is changed. + */ + + enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN; + switch (depth_compare_op) { + case VK_COMPARE_OP_ALWAYS: + case VK_COMPARE_OP_NOT_EQUAL: + /* OP_ALWAYS and OP_NOT_EQUAL could have a depth value of any direction, + * so if there is a depth write - LRZ must be disabled. + */ + if (z_write_enable) { + perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL"); + disable_lrz = true; + gras_lrz_cntl.dir = LRZ_DIR_INVALID; + } else { + perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL"); + temporary_disable_lrz = true; + } + break; + case VK_COMPARE_OP_EQUAL: + case VK_COMPARE_OP_NEVER: + /* The blob disables LRZ for OP_EQUAL, and from our empirical + * evidence it is the right thing to do. + * + * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer so + * we can just temporarily disable LRZ. + */ + temporary_disable_lrz = true; + break; + case VK_COMPARE_OP_GREATER: + case VK_COMPARE_OP_GREATER_OR_EQUAL: + lrz_direction = TU_LRZ_GREATER; + gras_lrz_cntl.greater = true; + gras_lrz_cntl.dir = LRZ_DIR_GE; + break; + case VK_COMPARE_OP_LESS: + case VK_COMPARE_OP_LESS_OR_EQUAL: + lrz_direction = TU_LRZ_LESS; + gras_lrz_cntl.greater = false; + gras_lrz_cntl.dir = LRZ_DIR_LE; + break; + default: + unreachable("bad VK_COMPARE_OP value or uninitialized"); + break; + }; + + /* If the depthfunc direction is changed, bail out on using LRZ.
The + * LRZ buffer encodes a min/max depth value per block, but if + * we switch from GT/GE <-> LT/LE, those values cannot be + * interpreted properly. + */ + if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN && + lrz_direction != TU_LRZ_UNKNOWN && + cmd->state.lrz.prev_direction != lrz_direction) { + if (z_write_enable) { + perf_debug(cmd->device, "Invalidating LRZ due to direction change"); + disable_lrz = true; + } else { + perf_debug(cmd->device, "Skipping LRZ due to direction change"); + temporary_disable_lrz = true; + } + } + + /* Consider the following sequence of depthfunc changes: + * + * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER + * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled + * during the second VK_COMPARE_OP_GREATER. + * + * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS + * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become + * invalid during COMPARE_OP_LESS. + * + * This shows that we should keep the last KNOWN direction. + */ + if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN) + cmd->state.lrz.prev_direction = lrz_direction; + + /* Invalidate LRZ and disable write if stencil test is enabled */ + bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE; + if (!disable_lrz && stencil_test_enable) { + bool stencil_front_writemask = + (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? + (cmd->state.dynamic_stencil_wrmask & 0xff) : + (pipeline->stencil_wrmask & 0xff); + + bool stencil_back_writemask = + (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? + ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) : + (pipeline->stencil_wrmask & 0xff00) >> 8; + + VkCompareOp stencil_front_compare_op = + (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT; + + VkCompareOp stencil_back_compare_op = + (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT; + + bool lrz_allowed = true; + lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed( + &gras_lrz_cntl, stencil_front_compare_op, + stencil_front_writemask); + + lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed( + &gras_lrz_cntl, stencil_back_compare_op, + stencil_back_writemask); + + /* Without a depth write we only need to make sure that the depth test + * is executed after the stencil test, so temporarily disabling LRZ is enough. + */ + if (!lrz_allowed) { + if (z_write_enable) { + perf_debug(cmd->device, "Invalidating LRZ due to stencil write"); + disable_lrz = true; + } else { + perf_debug(cmd->device, "Skipping LRZ due to stencil write"); + temporary_disable_lrz = true; + } + } + } + + if (disable_lrz) + cmd->state.lrz.valid = false; + + if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) { + /* The direction byte on the GPU should be set to CUR_DIR_DISABLED; + * for this it's not enough to emit an empty GRAS_LRZ_CNTL.
+ */ + gras_lrz_cntl.enable = true; + gras_lrz_cntl.dir = LRZ_DIR_INVALID; + + return gras_lrz_cntl; + } + + if (temporary_disable_lrz) + gras_lrz_cntl.enable = false; + + cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable; + if (!cmd->state.lrz.enabled) + memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl)); + + return gras_lrz_cntl; +} + +void +tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; + struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a); + + tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl)); + tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable)); +} diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index a78ea48aa95..8b2ccd8b5ec 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -265,6 +265,7 @@ enum tu_debug_flags TU_DEBUG_LAYOUT = 1 << 16, TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17, TU_DEBUG_PERF = 1 << 18, + TU_DEBUG_NOLRZFC = 1 << 19, }; struct tu_instance @@ -1126,11 +1127,16 @@ struct tu_lrz_pipeline struct tu_lrz_state { /* Depth/Stencil image currently on use to do LRZ */ - struct tu_image *image; + const struct tu_image_view *image_view; + VkClearValue depth_clear_value; /* If LRZ is in invalid state we cannot use it until depth is cleared */ bool valid : 1; /* Allows to temporary disable LRZ */ bool enabled : 1; + bool fast_clear : 1; + bool gpu_dir_tracking : 1; + /* Continue using old LRZ state (LOAD_OP_LOAD of depth) */ + bool reuse_previous_state : 1; enum tu_lrz_direction prev_direction; }; @@ -1535,6 +1541,51 @@ struct tu_pipeline struct util_dynarray executables; }; +struct tu_image; + +void +tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); + +void +tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image); + +void +tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + +void +tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + struct tu_image *image); + +void +tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd, + struct tu_image *image, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges); + +void +tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd, + const VkRenderPassBeginInfo *pRenderPassBegin); + +void +tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd, + struct tu_framebuffer *fb); + +void +tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + +void +tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + +void +tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + +void +tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs); + +void +tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd); + void tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport, bool z_negative_one_to_one); @@ -1542,9 +1593,6 @@ tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_vie void tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); -void -tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); - void tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); @@ -1691,6 +1739,8 @@ struct tu_image uint32_t lrz_height; uint32_t lrz_pitch; uint32_t lrz_offset; + uint32_t 
lrz_fc_offset; + uint32_t lrz_fc_size; bool shareable; };
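
For reference, below is a minimal standalone sketch (not part of the patch) of the fast-clear buffer sizing introduced in the tu_image.c hunk above. It assumes DIV_ROUND_UP is plain ceiling division and that one fast-clear bit covers a 16x4 group of 8x8-pixel LRZ blocks, as the hunk's math suggests; the resolutions in main() are illustrative values, not taken from the patch.

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

/* Mirrors the sizing math from the tu_image_init() hunk: divide the image
 * into 8x8-pixel LRZ blocks, group them 16x4, and store one bit per group,
 * rounded up to whole bytes. */
static unsigned
lrz_fc_size_bytes(unsigned width, unsigned height)
{
   unsigned nblocksx = DIV_ROUND_UP(DIV_ROUND_UP(width, 8), 16);
   unsigned nblocksy = DIV_ROUND_UP(DIV_ROUND_UP(height, 8), 4);
   return DIV_ROUND_UP(nblocksx * nblocksy, 8);
}

int
main(void)
{
   /* 3840x2160: 30 * 68 bits -> 255 bytes, within the 512-byte HW limit,
    * so fast-clear stays available. */
   printf("3840x2160 -> %u bytes\n", lrz_fc_size_bytes(3840, 2160));

   /* 8192x8192: 64 * 256 bits -> 2048 bytes, over the limit, so the patch
    * resets lrz_fc_size to 0 and skips fast-clear. */
   printf("8192x8192 -> %u bytes\n", lrz_fc_size_bytes(8192, 8192));
   return 0;
}

This corresponds to the has_lrz_fc check in the hunk, which additionally requires enable_lrz_fast_clear and the absence of the new TU_DEBUG_NOLRZFC debug flag.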