turnip: Share gmem allocations between attachments.

If attachments are used in separate subpasses, then we can store them in the same GMEM area. That gives us more space per attachment and thus larger tile sizes. For gfxbench vk-5-normal's main renderpass it goes from 128x128 to 160x128, improving perf by 2.125% +/- 0.194356% (n=4) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17943>
2026-05-06 05:08:08 +02:00 · 2022-08-01 13:08:39 -07:00 · 2022-08-01 13:08:39 -07:00 · 37ce9a7a61
commit 37ce9a7a61
parent ba9d0ba9a0
1 changed files with 98 additions and 46 deletions
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@ -599,6 +599,53 @@ tu_render_pass_check_ib2_skip(struct tu_render_pass *pass)
   }
 }

+struct tu_gmem_alloc {
+   uint32_t gmem_offset;
+   uint32_t cpp;
+   uint32_t first_subpass;
+   uint32_t last_subpass;
+};
+
+static struct tu_gmem_alloc *
+tu_gmem_alloc(struct tu_gmem_alloc *allocs,
+              uint32_t *num_allocs,
+              uint32_t cpp,
+              uint32_t first_subpass,
+              uint32_t last_subpass)
+{
+   struct tu_gmem_alloc *alloc = NULL;
+
+   /* Find an allocation that is free during our subpass range, with the
+    * closet usable cpp.
+    */
+   for (int i = 0; i < *num_allocs; i++) {
+      if (!(allocs[i].first_subpass > last_subpass ||
+            allocs[i].last_subpass < first_subpass)) {
+         continue;
+      }
+
+      if (allocs[i].cpp == cpp) {
+         alloc = &allocs[i];
+         break;
+      }
+      if (allocs[i].cpp > cpp && (!alloc || alloc->cpp > allocs[i].cpp))
+         alloc = &allocs[i];
+   }
+   if (alloc) {
+      /* Expand the range of the existing allocation */
+      alloc->first_subpass = MIN2(alloc->first_subpass, first_subpass);
+      alloc->last_subpass = MAX2(alloc->last_subpass, last_subpass);
+   } else {
+      /* Make a new gmem allocation for this cpp. */
+      alloc = &allocs[(*num_allocs)++];
+      alloc->cpp = cpp;
+      alloc->first_subpass = first_subpass;
+      alloc->last_subpass = last_subpass;
+   }
+
+   return alloc;
+}
+
 static void
 tu_render_pass_gmem_config(struct tu_render_pass *pass,
                           const struct tu_physical_device *phys_dev)
@ -609,24 +656,31 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
      /* log2(gmem_align/(tile_align_w*tile_align_h)) */
      uint32_t block_align_shift = 3;
      uint32_t tile_align_w = phys_dev->info->tile_align_w;
-      uint32_t gmem_align = (1 << block_align_shift) * tile_align_w *
-                            phys_dev->info->tile_align_h;
+      uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h;
+
+      /* gmem allocations to make, possibly shared between attachments. Each
+       * attachment may have 2 allocations, to handle separate stencil.
+       */
+      struct tu_gmem_alloc gmem_alloc[2 * pass->attachment_count];
+      uint32_t num_gmem_alloc = 0;
+      struct tu_gmem_alloc *att_gmem_alloc[2 * pass->attachment_count];
+      for (int i = 0; i < ARRAY_SIZE(att_gmem_alloc); i++)
+         att_gmem_alloc[i] = NULL;

-      /* calculate total bytes per pixel */
-      uint32_t cpp_total = 0;
-      uint32_t min_cpp = UINT32_MAX;
      for (uint32_t i = 0; i < pass->attachment_count; i++) {
         struct tu_render_pass_attachment *att = &pass->attachments[i];
         bool cpp1 = (att->cpp == 1);
         if (att->gmem) {
-            cpp_total += att->cpp;
-            min_cpp = MIN2(min_cpp, att->cpp);
+            att_gmem_alloc[i * 2] =
+               tu_gmem_alloc(gmem_alloc, &num_gmem_alloc, att->cpp,
+                           att->first_subpass_idx, att->last_subpass_idx);

            /* take into account the separate stencil: */
            if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-               min_cpp = MIN2(min_cpp, att->samples);
               cpp1 = (att->samples == 1);
-               cpp_total += att->samples;
+               att_gmem_alloc[i * 2 + 1] =
+                  tu_gmem_alloc(gmem_alloc, &num_gmem_alloc, att->samples,
+                              att->first_subpass_idx, att->last_subpass_idx);
            }

            /* texture pitch must be aligned to 64, use a tile_align_w that is
@ -640,16 +694,22 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
         }
      }

+      uint32_t cpp_total = 0;
+      uint32_t min_cpp = UINT32_MAX;
+      for (int i = 0; i < num_gmem_alloc; i++) {
+         cpp_total += gmem_alloc[i].cpp;
+         min_cpp = MIN2(min_cpp, gmem_alloc[i].cpp);
+      }
+
      pass->tile_align_w = tile_align_w;
      pass->min_cpp = min_cpp;

      /* no gmem attachments */
      if (cpp_total == 0) {
-         /* any value non-zero value so tiling config works with no
-          * attachments
-          */
-         pass->gmem_pixels[layout] = 1024 * 1024;
-         continue;
+         /* any non-zero value so tiling config works with no attachments */
+         for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++)
+            pass->gmem_pixels[i] = 1024*1024;
+         return;
      }

      /* TODO: this algorithm isn't optimal
@ -662,44 +722,36 @@ tu_render_pass_gmem_config(struct tu_render_pass *pass,
                              : phys_dev->ccu_offset_gmem;
      uint32_t gmem_blocks = gmem_size / gmem_align;
      uint32_t offset = 0, pixels = ~0u, i;
+      for (i = 0; i < num_gmem_alloc; i++) {
+         struct tu_gmem_alloc *alloc = &gmem_alloc[i];
+
+         uint32_t align = MAX2(1, alloc->cpp >> block_align_shift);
+         uint32_t nblocks = MAX2((gmem_blocks * alloc->cpp / cpp_total) & ~(align - 1), align);
+
+         if (nblocks > gmem_blocks) {
+            /* gmem layout impossible */
+            pass->gmem_pixels[layout] = 0;
+            continue;
+         }
+
+         gmem_blocks -= nblocks;
+         cpp_total -= alloc->cpp;
+         alloc->gmem_offset = offset;
+         offset += nblocks * gmem_align;
+         pixels = MIN2(pixels, nblocks * gmem_align / alloc->cpp);
+      }
+
+      pass->gmem_pixels[layout] = pixels;
+
      for (i = 0; i < pass->attachment_count; i++) {
         struct tu_render_pass_attachment *att = &pass->attachments[i];
         if (!att->gmem)
            continue;

-         att->gmem_offset[layout] = offset;
-
-         uint32_t align = MAX2(1, att->cpp >> block_align_shift);
-         uint32_t nblocks =
-            MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);
-
-         if (nblocks > gmem_blocks)
-            break;
-
-         gmem_blocks -= nblocks;
-         cpp_total -= att->cpp;
-         offset += nblocks * gmem_align;
-         pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);
-
-         /* repeat the same for separate stencil */
-         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
-            att->gmem_offset_stencil[layout] = offset;
-
-            /* note: for s8_uint, block align is always 1 */
-            uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
-            if (nblocks > gmem_blocks)
-               break;
-
-            gmem_blocks -= nblocks;
-            cpp_total -= att->samples;
-            offset += nblocks * gmem_align;
-            pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
-         }
+         att->gmem_offset[layout] = att_gmem_alloc[2 * i]->gmem_offset;
+         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+            att->gmem_offset_stencil[layout] = att_gmem_alloc[2 * i + 1]->gmem_offset;
      }
-
-      /* if the loop didn't complete then the gmem config is impossible */
-      if (i == pass->attachment_count)
-         pass->gmem_pixels[layout] = pixels;
   }
 }