From 37ce9a7a61287087b6bd5d7cc8d3628ff7e248b9 Mon Sep 17 00:00:00 2001
From: Emma Anholt <emma@anholt.net>
Date: Mon, 1 Aug 2022 13:08:39 -0700
Subject: [PATCH] turnip: Share gmem allocations between attachments.

If attachments are used in separate subpasses, then we can store them in
the same GMEM area.  That gives us more space per attachment and thus
larger tile sizes.  For gfxbench vk-5-normal's main renderpass it goes
from 128x128 to 160x128, improving perf by 2.125% +/- 0.194356% (n=4)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17943>
---
 src/freedreno/vulkan/tu_pass.cc | 144 ++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 46 deletions(-)

diff --git a/src/freedreno/vulkan/tu_pass.cc b/src/freedreno/vulkan/tu_pass.cc
index d8caa651e99..9964ea444a0 100644
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@@ -599,6 +599,53 @@ tu_render_pass_check_ib2_skip(struct tu_render_pass *pass)
    }
 }
 
+struct tu_gmem_alloc {
+   uint32_t gmem_offset;
+   uint32_t cpp;
+   uint32_t first_subpass;
+   uint32_t last_subpass;
+};
+
+static struct tu_gmem_alloc *
+tu_gmem_alloc(struct tu_gmem_alloc *allocs,
+              uint32_t *num_allocs,
+              uint32_t cpp,
+              uint32_t first_subpass,
+              uint32_t last_subpass)
+{
+   struct tu_gmem_alloc *alloc = NULL;
+
+   /* Find an allocation that is free during our subpass range, with the
+    * closet usable cpp.
+    */
+   for (int i = 0; i < *num_allocs; i++) {
+      if (!(allocs[i].first_subpass > last_subpass ||
+            allocs[i].last_subpass < first_subpass)) {
+         continue;
+      }
+
+      if (allocs[i].cpp == cpp) {
+         alloc = &allocs[i];
+         break;
+      }
+      if (allocs[i].cpp > cpp && (!alloc || alloc->cpp > allocs[i].cpp))
+         alloc = &allocs[i];
+   }
+   if (alloc) {
+      /* Expand the range of the existing allocation */
+      alloc->first_subpass = MIN2(alloc->first_subpass, first_subpass);
+      alloc->last_subpass = MAX2(alloc->last_subpass, last_subpass);
+   } else {
+      /* Make a new gmem allocation for this cpp. */
+      alloc = &allocs[(*num_allocs)++];
+      alloc->cpp = cpp;
+      alloc->first_subpass = first_subpass;
+      alloc->last_subpass = last_subpass;
+   }
+
+   return alloc;
+}
+
 static void
 tu_render_pass_gmem_config(struct tu_render_pass *pass,
                            const struct tu_physical_device *phys_dev)
@@ -609,24 +656,31 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
       /* log2(gmem_align/(tile_align_w*tile_align_h)) */
       uint32_t block_align_shift = 3;
       uint32_t tile_align_w = phys_dev->info->tile_align_w;
-      uint32_t gmem_align = (1 << block_align_shift) * tile_align_w *
-                            phys_dev->info->tile_align_h;
+      uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h;
+
+      /* gmem allocations to make, possibly shared between attachments. Each
+       * attachment may have 2 allocations, to handle separate stencil.
+       */
+      struct tu_gmem_alloc gmem_alloc[2 * pass->attachment_count];
+      uint32_t num_gmem_alloc = 0;
+      struct tu_gmem_alloc *att_gmem_alloc[2 * pass->attachment_count];
+      for (int i = 0; i < ARRAY_SIZE(att_gmem_alloc); i++)
+         att_gmem_alloc[i] = NULL;
 
-      /* calculate total bytes per pixel */
-      uint32_t cpp_total = 0;
-      uint32_t min_cpp = UINT32_MAX;
       for (uint32_t i = 0; i < pass->attachment_count; i++) {
          struct tu_render_pass_attachment *att = &pass->attachments[i];
          bool cpp1 = (att->cpp == 1);
          if (att->gmem) {
-            cpp_total += att->cpp;
-            min_cpp = MIN2(min_cpp, att->cpp);
+            att_gmem_alloc[i * 2] =
+               tu_gmem_alloc(gmem_alloc, &num_gmem_alloc, att->cpp,
+                           att->first_subpass_idx, att->last_subpass_idx);
 
             /* take into account the separate stencil: */
             if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-               min_cpp = MIN2(min_cpp, att->samples);
                cpp1 = (att->samples == 1);
-               cpp_total += att->samples;
+               att_gmem_alloc[i * 2 + 1] =
+                  tu_gmem_alloc(gmem_alloc, &num_gmem_alloc, att->samples,
+                              att->first_subpass_idx, att->last_subpass_idx);
             }
 
             /* texture pitch must be aligned to 64, use a tile_align_w that is
@@ -640,16 +694,22 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
          }
       }
 
+      uint32_t cpp_total = 0;
+      uint32_t min_cpp = UINT32_MAX;
+      for (int i = 0; i < num_gmem_alloc; i++) {
+         cpp_total += gmem_alloc[i].cpp;
+         min_cpp = MIN2(min_cpp, gmem_alloc[i].cpp);
+      }
+
       pass->tile_align_w = tile_align_w;
       pass->min_cpp = min_cpp;
 
       /* no gmem attachments */
       if (cpp_total == 0) {
-         /* any value non-zero value so tiling config works with no
-          * attachments
-          */
-         pass->gmem_pixels[layout] = 1024 * 1024;
-         continue;
+         /* any non-zero value so tiling config works with no attachments */
+         for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++)
+            pass->gmem_pixels[i] = 1024*1024;
+         return;
       }
 
       /* TODO: this algorithm isn't optimal
@@ -662,44 +722,36 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
                               : phys_dev->ccu_offset_gmem;
       uint32_t gmem_blocks = gmem_size / gmem_align;
       uint32_t offset = 0, pixels = ~0u, i;
+      for (i = 0; i < num_gmem_alloc; i++) {
+         struct tu_gmem_alloc *alloc = &gmem_alloc[i];
+
+         uint32_t align = MAX2(1, alloc->cpp >> block_align_shift);
+         uint32_t nblocks = MAX2((gmem_blocks * alloc->cpp / cpp_total) & ~(align - 1), align);
+
+         if (nblocks > gmem_blocks) {
+            /* gmem layout impossible */
+            pass->gmem_pixels[layout] = 0;
+            continue;
+         }
+
+         gmem_blocks -= nblocks;
+         cpp_total -= alloc->cpp;
+         alloc->gmem_offset = offset;
+         offset += nblocks * gmem_align;
+         pixels = MIN2(pixels, nblocks * gmem_align / alloc->cpp);
+      }
+
+      pass->gmem_pixels[layout] = pixels;
+
       for (i = 0; i < pass->attachment_count; i++) {
          struct tu_render_pass_attachment *att = &pass->attachments[i];
          if (!att->gmem)
             continue;
 
-         att->gmem_offset[layout] = offset;
-
-         uint32_t align = MAX2(1, att->cpp >> block_align_shift);
-         uint32_t nblocks =
-            MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);
-
-         if (nblocks > gmem_blocks)
-            break;
-
-         gmem_blocks -= nblocks;
-         cpp_total -= att->cpp;
-         offset += nblocks * gmem_align;
-         pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);
-
-         /* repeat the same for separate stencil */
-         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-            att->gmem_offset_stencil[layout] = offset;
-
-            /* note: for s8_uint, block align is always 1 */
-            uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
-            if (nblocks > gmem_blocks)
-               break;
-
-            gmem_blocks -= nblocks;
-            cpp_total -= att->samples;
-            offset += nblocks * gmem_align;
-            pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
-         }
+         att->gmem_offset[layout] = att_gmem_alloc[2 * i]->gmem_offset;
+         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+            att->gmem_offset_stencil[layout] = att_gmem_alloc[2 * i + 1]->gmem_offset;
       }
-
-      /* if the loop didn't complete then the gmem config is impossible */
-      if (i == pass->attachment_count)
-         pass->gmem_pixels[layout] = pixels;
    }
 }