From 54b50094a0dec65e481aff6b32d4ea7c8df47b83 Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Thu, 13 Feb 2025 11:29:29 -0500
Subject: [PATCH] tu: Implement bin merging for views

When apps use VK_QCOM_multiview_per_view_render_areas, there may be
some bins which are only visible (i.e. overlapping the render area) in
one view. In the typical VR use-case, there is a strip of bins to the
right of the left eye and to the left of the right eye that are not
used with that eye. By making sure that the right eye is never rendered
to for such bins, we can reuse its GMEM space to double the GMEM height
and merge two bins along the left edge, partially offsetting the cost
of the extra bins introduced by offsetting the left and right viewports
and render areas.

Part-of:
---
 src/freedreno/vulkan/tu_clear_blit.cc |  58 +++++---
 src/freedreno/vulkan/tu_cmd_buffer.cc | 184 ++++++++++++++++++++++----
 2 files changed, 194 insertions(+), 48 deletions(-)

diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index 04fc06f642c..aa60d61f583 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -4392,18 +4392,25 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
       frag_area = (VkExtent2D) { 1, 1 };
    }

-   unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
-   unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
-                              frag_area.width) + offset.x - 1;
-   unsigned y1 = state->rect.offset.y / frag_area.height + offset.y;
-   unsigned y2 = DIV_ROUND_UP(state->rect.offset.y + state->rect.extent.height,
-                              frag_area.height) + offset.y - 1;
+   if (bin.extent.width == 0 && bin.extent.height == 0) {
+      /* Clear a zero-area rectangle to skip this clear. */
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_RESOLVE_CNTL_1(.x = 1, .y = 1),
+                      A6XX_RB_RESOLVE_CNTL_2(.x = 0, .y = 0));
+   } else {
+      unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
+      unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
+                                 frag_area.width) + offset.x - 1;
+      unsigned y1 = state->rect.offset.y / frag_area.height + offset.y;
+      unsigned y2 = DIV_ROUND_UP(state->rect.offset.y + state->rect.extent.height,
+                                 frag_area.height) + offset.y - 1;

-   tu_cs_emit_pkt4(cs, REG_A6XX_RB_RESOLVE_CNTL_1, 2);
-   tu_cs_emit(cs,
-              A6XX_RB_RESOLVE_CNTL_1_X(x1) | A6XX_RB_RESOLVE_CNTL_1_Y(y1));
-   tu_cs_emit(cs,
-              A6XX_RB_RESOLVE_CNTL_2_X(x2) | A6XX_RB_RESOLVE_CNTL_2_Y(y2));
+      tu_cs_emit_pkt4(cs, REG_A6XX_RB_RESOLVE_CNTL_1, 2);
+      tu_cs_emit(cs,
+                 A6XX_RB_RESOLVE_CNTL_1_X(x1) | A6XX_RB_RESOLVE_CNTL_1_Y(y1));
+      tu_cs_emit(cs,
+                 A6XX_RB_RESOLVE_CNTL_2_X(x2) | A6XX_RB_RESOLVE_CNTL_2_Y(y2));
+   }
 }

 template <chip CHIP>
@@ -5578,15 +5585,26 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
    uint32_t scaled_width = bin.extent.width / frag_area.width;
    uint32_t scaled_height = bin.extent.height / frag_area.height;

-   tu_cs_emit_regs(
-      cs, GRAS_A2D_DEST_TL(CHIP, .x = bin.offset.x, .y = bin.offset.y),
-      GRAS_A2D_DEST_BR(CHIP, .x = bin.offset.x + bin.extent.width - 1,
-                       .y = bin.offset.y + bin.extent.height - 1));
-   tu_cs_emit_regs(cs,
-                   GRAS_A2D_SRC_XMIN(CHIP, common_bin_offset.x),
-                   GRAS_A2D_SRC_XMAX(CHIP, common_bin_offset.x + scaled_width - 1),
-                   GRAS_A2D_SRC_YMIN(CHIP, common_bin_offset.y),
-                   GRAS_A2D_SRC_YMAX(CHIP, common_bin_offset.y + scaled_height - 1));
+   if (bin.extent.width == 0 && bin.extent.height == 0) {
+      tu_cs_emit_regs(cs,
+                      GRAS_A2D_DEST_TL(CHIP, .x = 1, .y = 1),
+                      GRAS_A2D_DEST_BR(CHIP, .x = 0, .y = 0));
+      tu_cs_emit_regs(cs,
+                      GRAS_A2D_SRC_XMIN(CHIP, 1),
+                      GRAS_A2D_SRC_XMAX(CHIP, 0),
+                      GRAS_A2D_SRC_YMIN(CHIP, 1),
+                      GRAS_A2D_SRC_YMAX(CHIP, 0));
+   } else {
+      tu_cs_emit_regs(cs,
+                      GRAS_A2D_DEST_TL(CHIP, .x = bin.offset.x, .y = bin.offset.y),
+                      GRAS_A2D_DEST_BR(CHIP, .x = bin.offset.x + bin.extent.width - 1,
+                                       .y = bin.offset.y + bin.extent.height - 1));
+      tu_cs_emit_regs(cs,
+                      GRAS_A2D_SRC_XMIN(CHIP, common_bin_offset.x),
+                      GRAS_A2D_SRC_XMAX(CHIP, common_bin_offset.x + scaled_width - 1),
+                      GRAS_A2D_SRC_YMIN(CHIP, common_bin_offset.y),
+                      GRAS_A2D_SRC_YMAX(CHIP, common_bin_offset.y + scaled_height - 1));
+   }
 }

 template <chip CHIP>
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index fd82f1f1385..c76cbedf9ac 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -1428,7 +1428,17 @@ struct tu_tile_config {
    VkOffset2D pos;
    uint32_t pipe;
    uint32_t slot_mask;
-   VkExtent2D extent;
+   uint32_t visible_views;
+
+   /* For merged tiles, the extent in tiles when resolved to system memory.
+    */
+   VkExtent2D sysmem_extent;
+
+   /* For merged tiles, the extent in tiles in GMEM. This can only be more
+    * than 1 if there is extra free space from an unused view.
+    */
+   VkExtent2D gmem_extent;
+
    VkExtent2D frag_areas[MAX_VIEWS];
 };

@@ -1585,6 +1595,11 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
       tu7_emit_tile_render_begin_regs(cs);
    }

+   /* The GMEM stride is hardcoded when we emit input attachments and 3d
+    * loads, so the width can't be changed currently.
+    */
+   assert(tile->gmem_extent.width == 1);
+
    tu6_emit_bin_size_gmem(cmd, cs, BUFFERS_IN_GMEM, disable_lrz);

    tu_cs_emit_regs(cs,
@@ -1594,7 +1609,9 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
    const uint32_t x1 = tiling->tile0.width * tile->pos.x;
    const uint32_t y1 = tiling->tile0.height * tile->pos.y;
    const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
-   const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
+   const uint32_t y2 =
+      MIN2(y1 + tiling->tile0.height * tile->gmem_extent.height,
+           MAX_VIEWPORT_SIZE);

    if (bin_scale_en) {
       /* It seems that the window scissor happens *before*
@@ -1648,13 +1665,25 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
    if (cmd->fdm_bin_patchpoints.size != 0) {
       VkRect2D bin = {
          { x1, y1 },
-         { (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
+         {
+            tiling->tile0.width * tile->sysmem_extent.width,
+            tiling->tile0.height * tile->sysmem_extent.height
+         }
       };
       VkRect2D bins[views];
       VkOffset2D frag_offsets[MAX_VIEWS];
       for (unsigned i = 0; i < views; i++) {
         frag_offsets[i] = (VkOffset2D) { 0, 0 };
+        /* This makes the bin empty for non-visible views, so that nothing
+         * is rendered for them. This frees up the GMEM space for the
+         * non-visible view to be used to combine tiles.
+         */
+        if (!(tile->visible_views & (1u << i))) {
+           bins[i] = { { 0, 0 }, { 0, 0 } };
+           continue;
+        }
+
        if (!fdm_offsets || cmd->state.rp.shared_viewport) {
           bins[i] = bin;
           continue;
        }
@@ -1674,13 +1703,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
    if (bin_scale_en) {
       VkExtent2D frag_areas[MAX_HW_SCALED_VIEWS];
       for (unsigned i = 0; i < MAX_HW_SCALED_VIEWS; i++) {
-         if (i >= layers) {
-            /* Make sure unused views aren't garbage */
-            frag_areas[i] = (VkExtent2D) {1, 1};
-            frag_offsets[i] = (VkOffset2D) { 0, 0 };
-            continue;
-         }
-
          /* The HW bin offset is always per-layer, whereas if there is
           * more than 1 layer (i.e. layered rendering instead of
           * multiview rendering) and FDM is not per-layer then all
           * layers share the same fragment area, so we need to
           * explicitly broadcast it here.
          */
         unsigned view = MIN2(i, views - 1);
+
+         if (!(tile->visible_views & (1u << view)) || i >= layers) {
+            /* Make sure unused views aren't garbage */
+            frag_areas[i] = (VkExtent2D) {1, 1};
+            frag_offsets[i] = (VkOffset2D) { 0, 0 };
+            continue;
+         }
+
         frag_areas[i] = tile->frag_areas[view];
         frag_offsets[i].x = x1 - x1 / tile->frag_areas[view].width;
         frag_offsets[i].y = y1 - y1 / tile->frag_areas[view].height;
@@ -3798,16 +3828,64 @@ tu_identity_frag_area(struct tu_cmd_buffer *cmd,
       tile->frag_areas[i] = (VkExtent2D) { 1, 1 };
 }

+static bool
+rects_intersect(VkRect2D a, VkRect2D b)
+{
+   return a.offset.x < b.offset.x + (int32_t)b.extent.width &&
+          b.offset.x < a.offset.x + (int32_t)a.extent.width &&
+          a.offset.y < b.offset.y + (int32_t)b.extent.height &&
+          b.offset.y < a.offset.y + (int32_t)a.extent.height;
+}
+
+/* Use the render area(s) to figure out which views of the bin are visible.
+ */
+static void
+tu_calc_bin_visibility(struct tu_cmd_buffer *cmd,
+                       struct tu_tile_config *tile,
+                       const VkOffset2D *offsets)
+{
+   const struct tu_tiling_config *tiling = cmd->state.tiling;
+   uint32_t views = tu_fdm_num_layers(cmd);
+   VkRect2D bin = {
+      {
+         tile->pos.x * tiling->tile0.width,
+         tile->pos.y * tiling->tile0.height
+      },
+      tiling->tile0
+   };
+
+   tile->visible_views = 0;
+   for (unsigned i = 0; i < views; i++) {
+      VkRect2D offsetted_bin = bin;
+      if (offsets && !cmd->state.rp.shared_viewport) {
+         VkOffset2D bin_offset = tu_bin_offset(offsets[i], tiling);
+         offsetted_bin.offset.x -= bin_offset.x;
+         offsetted_bin.offset.y -= bin_offset.y;
+      }
+
+      if (rects_intersect(offsetted_bin,
+                          cmd->state.per_layer_render_area ?
+                          cmd->state.render_areas[i] :
+                          cmd->state.render_areas[0])) {
+         tile->visible_views |= (1u << i);
+      }
+   }
+}
+
 static bool
 try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src,
-                unsigned views, bool has_abs_bin_mask)
+                unsigned views, bool has_abs_bin_mask, bool shared_viewport)
 {
    uint32_t slot_mask = dst->slot_mask | src->slot_mask;
+   uint32_t visible_views = dst->visible_views | src->visible_views;

-   /* The fragment areas must be the same. */
+   /* The fragment areas must be the same for views where both bins are
+    * visible.
+    */
    for (unsigned i = 0; i < views; i++) {
-      if (dst->frag_areas[i].width != src->frag_areas[i].width ||
-          dst->frag_areas[i].height != src->frag_areas[i].height)
+      if ((dst->visible_views & src->visible_views & (1u << i)) &&
+          (dst->frag_areas[i].width != src->frag_areas[i].width ||
+           dst->frag_areas[i].height != src->frag_areas[i].height))
         return false;
    }

@@ -3815,15 +3893,19 @@ try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src,
     * compatible width/height.
     */
    if (dst->pos.x == src->pos.x) {
-      if (dst->extent.height != src->extent.height)
+      if (dst->sysmem_extent.height != src->sysmem_extent.height)
         return false;
    } else if (dst->pos.y == src->pos.y) {
-      if (dst->extent.width != src->extent.width)
+      if (dst->sysmem_extent.width != src->sysmem_extent.width)
         return false;
    } else {
      return false;
    }

+   if (dst->gmem_extent.width != src->gmem_extent.width ||
+       dst->gmem_extent.height != src->gmem_extent.height)
+      return false;
+
    if (!has_abs_bin_mask) {
       /* The mask of the combined tile has to fit in 16 bits */
       uint32_t hw_mask = slot_mask >> (ffs(slot_mask) - 1);
@@ -3835,28 +3917,54 @@ try_merge_tiles(struct tu_tile_config *dst, const struct tu_tile_config *src,
    * how we call this function below.
    */
   VkExtent2D extent = {
-      dst->extent.width + (dst->pos.x - src->pos.x),
-      dst->extent.height + (dst->pos.y - src->pos.y),
+      dst->sysmem_extent.width + (dst->pos.x - src->pos.x),
+      dst->sysmem_extent.height + (dst->pos.y - src->pos.y),
   };

-   assert(dst->extent.height > 0);
+   assert(dst->sysmem_extent.height > 0);

-   /* The common fragment areas must not be smaller than the combined bin
+   /* If only the first view is visible in both tiles, we can reuse the GMEM
+    * space meant for the rest of the views to multiply the height of the
+    * tile. We can't do this if we can't override the scissor for different
+    * views, though.
+    */
+   unsigned height_multiplier = 1;
+   if (visible_views == 1 && views > 1 && dst->gmem_extent.height == 1 &&
+       !shared_viewport)
+      height_multiplier = views;
+   else
+      height_multiplier = dst->gmem_extent.height;
+
+   /* The combined fragment areas must not be smaller than the combined bin
    * extent, so that the combined bin is not larger than the original
    * unscaled bin.
    */
   for (unsigned i = 0; i < views; i++) {
-      if (dst->frag_areas[i].width < extent.width ||
-          dst->frag_areas[i].height < extent.height)
+      if ((dst->visible_views & (1u << i)) &&
+          (dst->frag_areas[i].width < extent.width ||
+           dst->frag_areas[i].height * height_multiplier < extent.height))
+         return false;
+      if ((src->visible_views & (1u << i)) &&
+          (src->frag_areas[i].width < extent.width ||
+           src->frag_areas[i].height * height_multiplier < extent.height))
         return false;
   }

   /* Ok, let's combine them. dst is below or to the right of src, so it takes
    * src's position.
    */
-   dst->extent = extent;
+   for (unsigned i = 0; i < views; i++) {
+      if (src->visible_views & ~dst->visible_views & (1u << i))
+         dst->frag_areas[i] = src->frag_areas[i];
+      if (((src->visible_views | dst->visible_views) & (1u << i)) &&
+          dst->frag_areas[i].height < extent.height)
+         dst->gmem_extent.height = height_multiplier;
+   }
+   dst->sysmem_extent = extent;
+   dst->visible_views = visible_views;
   dst->pos = src->pos;
   dst->slot_mask = slot_mask;
+
   return true;
 }

@@ -3872,6 +3980,7 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
    unsigned views = tu_fdm_num_layers(cmd);
    bool has_abs_mask =
       cmd->device->physical_device->info->props.has_abs_bin_mask;
+   bool shared_viewport = cmd->state.rp.shared_viewport;

    struct tu_tile_config tiles[width * height];

@@ -3880,9 +3989,11 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
       for (uint32_t x = 0; x < width; x++) {
          struct tu_tile_config *tile = &tiles[width * y + x];
          tile->pos = { x + tx1, y + ty1 };
-         tile->extent = { 1, 1 };
+         tile->sysmem_extent = { 1, 1 };
+         tile->gmem_extent = { 1, 1 };
          tile->pipe = pipe;
          tile->slot_mask = 1u << (width * y + x);
+         tu_calc_bin_visibility(cmd, tile, fdm_offsets);
          tu_calc_frag_area(cmd, tile, fdm, fdm_offsets);
       }
    }
@@ -3893,9 +4004,12 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
    for (uint32_t y = 0; y < height; y++) {
       for (uint32_t x = 0; x < width; x++) {
          struct tu_tile_config *tile = &tiles[width * y + x];
+         if (tile->visible_views == 0)
+            continue;
+
          if (x > 0) {
             struct tu_tile_config *prev_x_tile = &tiles[width * y + x - 1];
-            if (try_merge_tiles(tile, prev_x_tile, views, has_abs_mask)) {
+            if (try_merge_tiles(tile, prev_x_tile, views, has_abs_mask,
+                                shared_viewport)) {
               merged_tiles |= prev_x_tile->slot_mask;
            }
         }
@@ -3907,7 +4021,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
             * merged horizontally into its neighbor in the previous row.
            */
            if (!(merged_tiles & (1u << prev_y_idx)) &&
-               try_merge_tiles(tile, prev_y_tile, views, has_abs_mask)) {
+               try_merge_tiles(tile, prev_y_tile, views, has_abs_mask,
+                               shared_viewport)) {
               merged_tiles |= prev_y_tile->slot_mask;
            }
         }
@@ -3927,7 +4042,11 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
         if (merged_tiles & (1u << tile_idx))
            continue;

-         tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], fdm_offsets);
+         struct tu_tile_config *tile = &tiles[tile_idx];
+         if (tile->visible_views == 0)
+            continue;
+
+         tu6_render_tile(cmd, &cmd->cs, tile, fdm_offsets);
      }
   }
 }
@@ -3983,6 +4102,13 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
   }

   bool has_fdm = fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm);
+   /* TODO: we should also be able to merge tiles when only
+    * per_view_render_areas is used without FDM. That requires using another
+    * method to force-disable draws, such as overriding the view mask, since
+    * we don't want to force the viewport to be re-emitted. It would also
+    * require disabling stores, and adding patchpoints for
+    * CmdClearAttachments in secondaries or making it use the view mask.
+    */
   bool merge_tiles = has_fdm &&
      !TU_DEBUG(NO_BIN_MERGING) &&
      cmd->device->physical_device->info->props.has_bin_mask;
@@ -4038,8 +4164,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
            .pos = { tx1 + tx, ty },
            .pipe = pipe,
            .slot_mask = 1u << (slot_row + tx),
-            .extent = { 1, 1 },
+            .sysmem_extent = { 1, 1 },
+            .gmem_extent = { 1, 1 },
         };
+         tu_calc_bin_visibility(cmd, &tile, fdm_offsets);
         if (has_fdm)
            tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);
         else
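
Reviewer note: the per-view visibility computation above is easiest to
follow in isolation, so here is a minimal standalone sketch of the same
logic. Rect2D, Offset2D, and calc_bin_visibility are hypothetical
stand-ins for VkRect2D, VkOffset2D, and tu_calc_bin_visibility(); the
in-tree code reads the render areas and per-view offsets out of
tu_cmd_buffer state instead of taking them as parameters.

    /* Standalone sketch (not part of the patch): per-view bin visibility. */
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct { int32_t x, y; } Offset2D;
    typedef struct { uint32_t width, height; } Extent2D;
    typedef struct { Offset2D offset; Extent2D extent; } Rect2D;

    /* Overlap test, same shape as rects_intersect() in the patch. */
    static bool
    rects_intersect(Rect2D a, Rect2D b)
    {
       return a.offset.x < b.offset.x + (int32_t)b.extent.width &&
              b.offset.x < a.offset.x + (int32_t)a.extent.width &&
              a.offset.y < b.offset.y + (int32_t)b.extent.height &&
              b.offset.y < a.offset.y + (int32_t)a.extent.height;
    }

    /* Bit i of the result is set if the bin overlaps view i's render area.
     * bin_offsets plays the role of the per-view FDM bin offsets; pass NULL
     * when all views share one viewport. */
    static uint32_t
    calc_bin_visibility(Rect2D bin, unsigned views,
                        const Rect2D *render_areas, const Offset2D *bin_offsets)
    {
       uint32_t visible = 0;
       for (unsigned i = 0; i < views; i++) {
          Rect2D b = bin;
          if (bin_offsets) {
             /* Shift the bin the same way the view's viewport is shifted. */
             b.offset.x -= bin_offsets[i].x;
             b.offset.y -= bin_offsets[i].y;
          }
          if (rects_intersect(b, render_areas[i]))
             visible |= 1u << i;
       }
       return visible;
    }

For the VR layout from the commit message, a bin hugging the left edge
typically intersects only render_areas[0], so the mask comes back as 0x1
and the bin's GMEM space for view 1 goes unused; that unused space is
exactly what try_merge_tiles() reclaims.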
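The GMEM-reuse rule in try_merge_tiles() can likewise be summarized in a
few lines. This is a sketch under the same assumptions as above;
height_multiplier_for_merge is a hypothetical helper, not code from the
patch:

    /* Standalone sketch (not part of the patch) of the height-multiplier
     * rule in try_merge_tiles(). */
    #include <stdbool.h>
    #include <stdint.h>

    static unsigned
    height_multiplier_for_merge(uint32_t visible_views, unsigned views,
                                unsigned gmem_extent_height,
                                bool shared_viewport)
    {
       /* visible_views == 1 means exactly view 0 is visible in both tiles.
        * The freed space can only be claimed once (gmem_extent_height == 1),
        * and only when the scissor can be overridden per view. */
       if (visible_views == 1 && views > 1 && gmem_extent_height == 1 &&
           !shared_viewport)
          return views;
       return gmem_extent_height;
    }

In the typical stereo case (views == 2), a column of left-edge bins
visible only in view 0 gets a multiplier of 2. The merged tile's
gmem_extent.height becomes 2, so the window scissor's y2 in
tu6_emit_tile_select() grows to y1 + tile0.height * 2, and two vertically
adjacent bins render into the GMEM that would otherwise hold view 1's
copy of the bin.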