diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index e37495c7077..c63ae7dfe36 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -3207,52 +3207,38 @@ sanitize_dst_stage(VkPipelineStageFlags2 stage_mask) static enum tu_stage vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst) { + /* If the destination stage is executed on the CP, then the CP also has to + * wait for any WFI's to finish. This is already done for draw calls, + * including before indirect param reads, for the most part, so we just + * need to WFI and can use TU_STAGE_GPU. + * + * However, some indirect draw opcodes, depending on firmware, don't have + * implicit CP_WAIT_FOR_ME so we have to handle it manually. + * + * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly + * does CP_WAIT_FOR_ME, so we don't include them here. + * + * Currently we read the draw predicate using CP_MEM_TO_MEM, which + * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not* + * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to + * complete since it's written for DX11 where you can only predicate on the + * result of a query object. So if we implement 64-bit comparisons in the + * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit + * comparisons, then this will have to be dealt with. + */ if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT || vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT || vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT) return TU_STAGE_CP; - if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT) - return TU_STAGE_FE; - - if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT) - return TU_STAGE_SP_VS; - - if (vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT) - return TU_STAGE_SP_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT || /* Yes, really */ - /* See comment in TU_STAGE_GRAS about early fragment tests */ - vk_stage == VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT || - vk_stage == VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT || - vk_stage == VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT) - - return TU_STAGE_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_COPY_BIT || - vk_stage == VK_PIPELINE_STAGE_2_BLIT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_RESOLVE_BIT || - vk_stage == VK_PIPELINE_STAGE_2_CLEAR_BIT || - vk_stage == VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT) - /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */ - return dst ? TU_STAGE_SP_PS : TU_STAGE_PS; - if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT || vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT) - /* Be conservative */ - return dst ? TU_STAGE_CP : TU_STAGE_PS; + return dst ? TU_STAGE_CP : TU_STAGE_GPU; if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT) - return dst ? TU_STAGE_PS : TU_STAGE_CP; + return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP; - unreachable("unknown pipeline stage"); + return TU_STAGE_GPU; } static enum tu_stage @@ -3270,7 +3256,7 @@ vk2tu_src_stage(VkPipelineStageFlags2 vk_stages) static enum tu_stage vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages) { - enum tu_stage stage = TU_STAGE_PS; + enum tu_stage stage = TU_STAGE_BOTTOM; u_foreach_bit64 (bit, vk_stages) { enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); stage = MIN2(stage, new_stage); @@ -3283,34 +3269,14 @@ static void tu_flush_for_stage(struct tu_cache_state *cache, enum tu_stage src_stage, enum tu_stage dst_stage) { - /* As far as we know, flushes take place in the last stage so if there are - * any pending flushes then we have to move down the source stage, because - * the data only becomes available when the flush finishes. In particular - * this can matter when the CP writes something and we need to invalidate - * UCHE to read it. + /* Even if the source is the host or CP, the destination access could + * generate invalidates that we have to wait to complete. */ - if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE)) - src_stage = TU_STAGE_PS; + if (src_stage == TU_STAGE_CP && + (cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE)) + src_stage = TU_STAGE_GPU; - /* Note: if the destination stage is the CP, then the CP also has to wait - * for any WFI's to finish. This is already done for draw calls, including - * before indirect param reads, for the most part, so we just need to WFI. - * - * However, some indirect draw opcodes, depending on firmware, don't have - * implicit CP_WAIT_FOR_ME so we have to handle it manually. - * - * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly - * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it. - * - * Currently we read the draw predicate using CP_MEM_TO_MEM, which - * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not* - * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to - * complete since it's written for DX11 where you can only predicate on the - * result of a query object. So if we implement 64-bit comparisons in the - * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit - * comparisons, then this will have to be dealt with. - */ - if (src_stage > dst_stage) { + if (src_stage >= dst_stage) { cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; if (dst_stage == TU_STAGE_CP) cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 570cf61b48b..b9f2080af0c 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -149,55 +149,28 @@ enum tu_cmd_access_mask { TU_ACCESS_WRITE, }; -/* Starting with a6xx, the pipeline is split into several "clusters" (really - * pipeline stages). Each stage has its own pair of register banks and can - * switch them independently, so that earlier stages can run ahead of later - * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at - * the same time. +/* From the driver's point of view, we only need to distinguish between things + * which won't start until a WFI is complete and things which additionally + * need a WAIT_FOR_ME. * - * As a result of this, we need to insert a WFI when an earlier stage depends - * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any - * pending WFI's to complete before starting, and usually before reading - * indirect params even, so a WFI also acts as a full "pipeline stall". - * - * Note, the names of the stages come from CLUSTER_* in devcoredump. We - * include all the stages for completeness, even ones which do not read/write - * anything. + * TODO: This will get more complicated with concurrent binning. */ - enum tu_stage { - /* This doesn't correspond to a cluster, but we need it for tracking - * indirect draw parameter reads etc. + /* As a destination stage, this is for operations on the CP which don't + * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME. + * As a source stage, it is for things needing no waits. */ TU_STAGE_CP, - /* - Fetch index buffer - * - Fetch vertex attributes, dispatch VS + /* This is for most operations, which WFI will wait to finish and will not + * start until any pending WFIs are finished. */ - TU_STAGE_FE, + TU_STAGE_GPU, - /* Execute all geometry stages (VS thru GS) */ - TU_STAGE_SP_VS, - - /* Write to VPC, do primitive assembly. */ - TU_STAGE_PC_VS, - - /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according - * to devcoredump so presumably this stage stalls for TU_STAGE_PS when - * early depth testing is enabled before dispatching fragments? However - * GRAS reads and writes LRZ directly. + /* This is only used as a destination stage and is for things needing no + * waits on the GPU (e.g. host operations). */ - TU_STAGE_GRAS, - - /* Execute FS */ - TU_STAGE_SP_PS, - - /* - Fragment tests - * - Write color/depth - * - Streamout writes (???) - * - Varying interpolation (???) - */ - TU_STAGE_PS, + TU_STAGE_BOTTOM, }; enum tu_cmd_flush_bits {