diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index e37495c7077..c63ae7dfe36 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -3207,52 +3207,38 @@ sanitize_dst_stage(VkPipelineStageFlags2 stage_mask)
 static enum tu_stage
 vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
 {
+   /* If the destination stage is executed on the CP, then the CP also has to
+    * wait for any WFI's to finish. This is already done for draw calls,
+    * including before indirect param reads, for the most part, so we just
+    * need to WFI and can use TU_STAGE_GPU.
+    *
+    * However, some indirect draw opcodes, depending on firmware, don't have
+    * implicit CP_WAIT_FOR_ME so we have to handle it manually.
+    *
+    * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
+    * does CP_WAIT_FOR_ME, so we don't include them here.
+    *
+    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
+    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
+    * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
+    * complete since it's written for DX11 where you can only predicate on the
+    * result of a query object. So if we implement 64-bit comparisons in the
+    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
+    * comparisons, then this will have to be dealt with.
+    */
    if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT ||
        vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT ||
        vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT)
       return TU_STAGE_CP;
 
-   if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT)
-      return TU_STAGE_FE;
-
-   if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)
-      return TU_STAGE_SP_VS;
-
-   if (vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
-      return TU_STAGE_SP_PS;
-
-   if (vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT || /* Yes, really */
-   /* See comment in TU_STAGE_GRAS about early fragment tests */
-       vk_stage == VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)
-
-      return TU_STAGE_PS;
-
-   if (vk_stage == VK_PIPELINE_STAGE_2_COPY_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_BLIT_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_RESOLVE_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_CLEAR_BIT ||
-       vk_stage == VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)
-      /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
-      return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;
-
    if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT ||
        vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
-      /* Be conservative */
-      return dst ? TU_STAGE_CP : TU_STAGE_PS;
+      return dst ? TU_STAGE_CP : TU_STAGE_GPU;
 
    if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT)
-      return dst ? TU_STAGE_PS : TU_STAGE_CP;
+      return dst ? TU_STAGE_BOTTOM : TU_STAGE_CP;
 
-   unreachable("unknown pipeline stage");
+   return TU_STAGE_GPU;
 }
 
 static enum tu_stage
@@ -3270,7 +3256,7 @@ vk2tu_src_stage(VkPipelineStageFlags2 vk_stages)
 static enum tu_stage
 vk2tu_dst_stage(VkPipelineStageFlags2 vk_stages)
 {
-   enum tu_stage stage = TU_STAGE_PS;
+   enum tu_stage stage = TU_STAGE_BOTTOM;
    u_foreach_bit64 (bit, vk_stages) {
       enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
       stage = MIN2(stage, new_stage);
@@ -3283,34 +3269,14 @@ static void
 tu_flush_for_stage(struct tu_cache_state *cache,
                    enum tu_stage src_stage, enum tu_stage dst_stage)
 {
-   /* As far as we know, flushes take place in the last stage so if there are
-    * any pending flushes then we have to move down the source stage, because
-    * the data only becomes available when the flush finishes. In particular
-    * this can matter when the CP writes something and we need to invalidate
-    * UCHE to read it.
+   /* Even if the source is the host or CP, the destination access could
+    * generate invalidates that we have to wait to complete.
     */
-   if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE))
-      src_stage = TU_STAGE_PS;
+   if (src_stage == TU_STAGE_CP &&
+       (cache->flush_bits & TU_CMD_FLAG_ALL_INVALIDATE))
+      src_stage = TU_STAGE_GPU;
 
-   /* Note: if the destination stage is the CP, then the CP also has to wait
-    * for any WFI's to finish. This is already done for draw calls, including
-    * before indirect param reads, for the most part, so we just need to WFI.
-    *
-    * However, some indirect draw opcodes, depending on firmware, don't have
-    * implicit CP_WAIT_FOR_ME so we have to handle it manually.
-    *
-    * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
-    * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
-    *
-    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
-    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
-    * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
-    * complete since it's written for DX11 where you can only predicate on the
-    * result of a query object. So if we implement 64-bit comparisons in the
-    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
-    * comparisons, then this will have to be dealt with.
-    */
-   if (src_stage > dst_stage) {
+   if (src_stage >= dst_stage) {
       cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
       if (dst_stage == TU_STAGE_CP)
          cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 570cf61b48b..b9f2080af0c 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -149,55 +149,28 @@ enum tu_cmd_access_mask {
       TU_ACCESS_WRITE,
 };
 
-/* Starting with a6xx, the pipeline is split into several "clusters" (really
- * pipeline stages). Each stage has its own pair of register banks and can
- * switch them independently, so that earlier stages can run ahead of later
- * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
- * the same time.
+/* From the driver's point of view, we only need to distinguish between things
+ * which won't start until a WFI is complete and things which additionally
+ * need a WAIT_FOR_ME.
  *
- * As a result of this, we need to insert a WFI when an earlier stage depends
- * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
- * pending WFI's to complete before starting, and usually before reading
- * indirect params even, so a WFI also acts as a full "pipeline stall".
- *
- * Note, the names of the stages come from CLUSTER_* in devcoredump. We
- * include all the stages for completeness, even ones which do not read/write
- * anything.
+ * TODO: This will get more complicated with concurrent binning.
  */
-
 enum tu_stage {
-   /* This doesn't correspond to a cluster, but we need it for tracking
-    * indirect draw parameter reads etc.
+   /* As a destination stage, this is for operations on the CP which don't
+    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
+    * As a source stage, it is for things needing no waits. 
     */
    TU_STAGE_CP,
 
-   /* - Fetch index buffer
-    * - Fetch vertex attributes, dispatch VS
+   /* This is for most operations, which WFI will wait to finish and will not
+    * start until any pending WFIs are finished.
     */
-   TU_STAGE_FE,
+   TU_STAGE_GPU,
 
-   /* Execute all geometry stages (VS thru GS) */
-   TU_STAGE_SP_VS,
-
-   /* Write to VPC, do primitive assembly. */
-   TU_STAGE_PC_VS,
-
-   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
-    * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
-    * early depth testing is enabled before dispatching fragments? However
-    * GRAS reads and writes LRZ directly.
+   /* This is only used as a destination stage and is for things needing no
+    * waits on the GPU (e.g. host operations).
     */
-   TU_STAGE_GRAS,
-
-   /* Execute FS */
-   TU_STAGE_SP_PS,
-
-   /* - Fragment tests
-    * - Write color/depth
-    * - Streamout writes (???)
-    * - Varying interpolation (???)
-    */
-   TU_STAGE_PS,
+   TU_STAGE_BOTTOM,
 };
 
 enum tu_cmd_flush_bits {