v3dv: skip unnecessary tile loads when blitting

If we are blitting to tile boundaries, we don't need to emit
tile loads. The exception is when we are blitting only a subset
of the pixel components in the image (which we do for
single-aspect blits of D24S8), since in that case we need to
preserve the components we are not writing.

There is a corner case: sometimes we create framebuffers that
alias subregions of a larger image. In that case the edge tiles
are not padded and we can't skip the loads.
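
The decision roughly boils down to the check sketched below. This is a
minimal, standalone illustration of the rule described above; the names
(can_skip_tile_loads, blit_region and the parameters) are hypothetical
and are not the driver code:

    #include <stdbool.h>
    #include <stdint.h>

    struct blit_region {
       uint32_t x, y, w, h; /* destination rectangle, in pixels */
    };

    /* Sketch: can the blit use a render pass with loadOp = DONT_CARE? */
    static bool
    can_skip_tile_loads(const struct blit_region *r,
                        uint32_t fb_w, uint32_t fb_h,
                        uint32_t tile_w, uint32_t tile_h,
                        bool writes_all_components,
                        bool fb_has_edge_padding)
    {
       /* Writing only some components (e.g. a single-aspect blit of D24S8)
        * must preserve the rest, so the tile buffer has to be loaded first.
        */
       if (!writes_all_components)
          return false;

       /* The destination region must start on a tile boundary and either
        * end on one or run to the framebuffer edge; the latter is only safe
        * when the edge tiles are padded, i.e. the framebuffer does not
        * alias a subregion of a larger image.
        */
       return r->x % tile_w == 0 &&
              r->y % tile_h == 0 &&
              (r->w % tile_w == 0 ||
               (fb_has_edge_padding && r->x + r->w >= fb_w)) &&
              (r->h % tile_h == 0 ||
               (fb_has_edge_padding && r->y + r->h >= fb_h));
    }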

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7247>
Iago Toral Quiroga 2020-10-08 13:24:13 +02:00
parent c83d6ffa32
commit 0a4fc19605
5 changed files with 132 additions and 64 deletions

View file

@@ -1181,11 +1181,6 @@ emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
}
}
/* Checks whether the render area rectangle covers a region that is aligned to
* tile boundaries, which means that for all tiles covered by the render area
* region, there are no uncovered pixels (unless they are also outside the
* framebuffer).
*/
static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1200,24 +1195,11 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
* always have framebuffer information available.
*/
assert(cmd_buffer->state.framebuffer);
const VkExtent2D fb_extent = {
.width = cmd_buffer->state.framebuffer->width,
.height = cmd_buffer->state.framebuffer->height
};
VkExtent2D granularity;
v3dv_subpass_get_granularity(cmd_buffer->state.pass,
cmd_buffer->state.subpass_idx,
&granularity);
cmd_buffer->state.tile_aligned_render_area =
rect->offset.x % granularity.width == 0 &&
rect->offset.y % granularity.height == 0 &&
(rect->extent.width % granularity.width == 0 ||
rect->offset.x + rect->extent.width >= fb_extent.width) &&
(rect->extent.height % granularity.height == 0 ||
rect->offset.y + rect->extent.height >= fb_extent.height);
v3dv_subpass_area_is_tile_aligned(rect,
cmd_buffer->state.framebuffer,
cmd_buffer->state.pass,
cmd_buffer->state.subpass_idx);
if (!cmd_buffer->state.tile_aligned_render_area) {
perf_debug("Render area for subpass %d of render pass %p doesn't "
@@ -2023,7 +2005,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
assert(state->subpass_idx < state->pass->subpass_count);
const struct v3dv_render_pass *pass = state->pass;
const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
struct v3dv_cl *rcl = &job->rcl;
/* Common config must be the first TILE_RENDERING_MODE_CFG and
@@ -2031,7 +2012,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
* updates to the previous HW state.
*/
const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.image_width_pixels = framebuffer->width;
config.image_height_pixels = framebuffer->height;

View file

@@ -1977,6 +1977,8 @@ v3dv_CreateFramebuffer(VkDevice _device,
framebuffer->width = pCreateInfo->width;
framebuffer->height = pCreateInfo->height;
framebuffer->layers = pCreateInfo->layers;
framebuffer->has_edge_padding = true;
framebuffer->attachment_count = pCreateInfo->attachmentCount;
framebuffer->color_attachment_count = 0;
for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {

View file

@@ -61,6 +61,7 @@ v3dv_meta_blit_finish(struct v3dv_device *device)
struct v3dv_meta_blit_pipeline *item = entry->data;
v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
vk_free(&device->alloc, item);
}
_mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
@@ -771,7 +772,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
const VkImageBlit *region,
VkFilter filter);
VkFilter filter,
bool dst_is_padded_image);
/**
* Returns true if the implementation supports the requested operation (even if
@@ -998,7 +1000,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_image_from_handle(buffer_image), dst_format,
image, src_format,
cmask, &cswizzle,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, false);
if (!handled) {
/* This is unexpected, we should have a supported blit spec */
unreachable("Unable to blit buffer to destination image");
@@ -1454,7 +1456,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
dst, format,
src, format,
0, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
/* We should have selected formats that we can blit */
assert(handled);
@@ -2693,7 +2695,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
image, dst_format,
v3dv_image_from_handle(buffer_image), src_format,
cmask, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
if (!handled) {
/* This is unexpected, we should have a supported blit spec */
unreachable("Unable to blit buffer to destination image");
@@ -3101,20 +3103,15 @@ static bool
create_blit_render_pass(struct v3dv_device *device,
VkFormat dst_format,
VkFormat src_format,
VkRenderPass *pass)
VkRenderPass *pass_load,
VkRenderPass *pass_no_load)
{
const bool is_color_blit = vk_format_is_color(dst_format);
/* FIXME: if blitting to tile boundaries or to the whole image, we could
* use LOAD_DONT_CARE, but then we would have to include that in the
* pipeline hash key. Or maybe we should just create both render passes and
* use one or the other at draw time since they would both be compatible
* with the pipeline anyway
*/
/* Attachment load operation is specified below */
VkAttachmentDescription att = {
.format = dst_format,
.samples = VK_SAMPLE_COUNT_1_BIT,
.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
.initialLayout = VK_IMAGE_LAYOUT_GENERAL,
.finalLayout = VK_IMAGE_LAYOUT_GENERAL,
@@ -3146,8 +3143,16 @@ create_blit_render_pass(struct v3dv_device *device,
.pDependencies = NULL,
};
VkResult result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass);
VkResult result;
att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass_load);
if (result != VK_SUCCESS)
return false;
att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass_no_load);
return result == VK_SUCCESS;
}
@@ -3763,10 +3768,14 @@ get_blit_pipeline(struct v3dv_device *device,
goto fail;
ok = create_blit_render_pass(device, dst_format, src_format,
&(*pipeline)->pass);
&(*pipeline)->pass,
&(*pipeline)->pass_no_load);
if (!ok)
goto fail;
/* Create the pipeline using one of the render passes: they are both
* compatible, so we don't care which one we use here.
*/
ok = create_blit_pipeline(device,
dst_format,
src_format,
@@ -3794,6 +3803,8 @@ fail:
if (*pipeline) {
if ((*pipeline)->pass)
v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
if ((*pipeline)->pass_no_load)
v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->alloc);
if ((*pipeline)->pipeline)
v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
vk_free(&device->alloc, *pipeline);
@@ -3896,7 +3907,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
const VkImageBlit *_region,
VkFilter filter)
VkFilter filter,
bool dst_is_padded_image)
{
bool handled = true;
@@ -3907,7 +3919,6 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
!vk_format_is_depth_or_stencil(dst_format));
VkImageBlit region = *_region;
/* Rewrite combined D/S blits to compatible color blits */
if (vk_format_is_depth_or_stencil(dst_format)) {
assert(src_format == dst_format);
@@ -3940,12 +3951,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
}
if (cmask == 0) {
cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT;
}
const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT;
if (cmask == 0)
cmask = full_cmask;
VkComponentMapping ident_swizzle = {
.r = VK_COMPONENT_SWIZZLE_IDENTITY,
@@ -4072,7 +4083,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
&pipeline);
if (!ok)
return handled;
assert(pipeline && pipeline->pipeline && pipeline->pass);
assert(pipeline && pipeline->pipeline &&
pipeline->pass && pipeline->pass_no_load);
struct v3dv_device *device = cmd_buffer->device;
assert(cmd_buffer->meta.blit.dspool);
@@ -4128,6 +4140,11 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
goto fail;
struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
fb_info.height == dst_level_h &&
dst_is_padded_image;
v3dv_cmd_buffer_add_private_obj(
cmd_buffer, (uintptr_t)fb,
(v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
@@ -4208,15 +4225,30 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
};
v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
/* If the region we are about to blit is tile-aligned, then we can
* use the render pass version that won't pre-load the tile buffer
* with the dst image contents before the blit. The exception is when we
* don't have a full color mask, since in that case we need to preserve
* the original value of some of the color components.
*/
const VkRect2D render_area = {
.offset = { dst_x, dst_y },
.extent = { dst_w, dst_h },
};
struct v3dv_render_pass *pipeline_pass =
v3dv_render_pass_from_handle(pipeline->pass);
bool can_skip_tlb_load =
cmask == full_cmask &&
v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer,
pipeline_pass, 0);
/* Record blit */
VkRenderPassBeginInfo rp_info = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
.renderPass = pipeline->pass,
.renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
pipeline->pass,
.framebuffer = fb,
.renderArea = {
.offset = { dst_x, dst_y },
.extent = { dst_w, dst_h }
},
.renderArea = render_area,
.clearValueCount = 0,
};
@@ -4308,7 +4340,7 @@ v3dv_CmdBlitImage(VkCommandBuffer commandBuffer,
dst, dst->vk_format,
src, src->vk_format,
0, NULL,
&pRegions[i], filter)) {
&pRegions[i], filter, true)) {
continue;
}
unreachable("Unsupported blit operation");
@@ -4469,7 +4501,7 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
dst, dst->vk_format,
src, src->vk_format,
0, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
}
void

View file

@@ -255,10 +255,10 @@ v3dv_DestroyRenderPass(VkDevice _device,
vk_free2(&device->alloc, pAllocator, pass);
}
void
v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity)
static void
subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity)
{
static const uint8_t tile_sizes[] = {
64, 64,
@@ -321,8 +321,50 @@ v3dv_GetRenderAreaGranularity(VkDevice device,
for (uint32_t i = 0; i < pass->subpass_count; i++) {
VkExtent2D sg;
v3dv_subpass_get_granularity(pass, i, &sg);
subpass_get_granularity(pass, i, &sg);
pGranularity->width = MIN2(pGranularity->width, sg.width);
pGranularity->height = MIN2(pGranularity->height, sg.height);
}
}
/* Checks whether the render area rectangle covers a region that is aligned to
* tile boundaries. This means that we are writing to all pixels covered by
* all tiles in that area (except for pixels on edge tiles that are outside
* the framebuffer dimensions).
*
* When our framebuffer is aligned to tile boundaries we know we are writing
* valid data to all pixels in each tile and we can apply certain
* optimizations, like avoiding tile loads, since we know that none of the
* original pixel values in each tile for that area need to be preserved.
* We also use this to decide if we can use TLB clears, as these clear whole
* tiles so we can't use them if the render area is not aligned.
*
* Note that when an image is created it will possibly include padding blocks
* depending on its tiling layout. When the framebuffer dimensions are not
* aligned to tile boundaries then edge tiles are only partially covered by the
* framebuffer pixels, but tile stores still seem to store full tiles,
* writing to the padded sections. This is important when the framebuffer
* is aliasing a smaller section of a larger image, as in that case the edge
* tiles of the framebuffer would overwrite valid pixels in the larger image.
* In that case, we can't flag the area as being aligned.
*/
bool
v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
struct v3dv_framebuffer *fb,
struct v3dv_render_pass *pass,
uint32_t subpass_idx)
{
assert(subpass_idx >= 0 && subpass_idx < pass->subpass_count);
VkExtent2D granularity;
subpass_get_granularity(pass, subpass_idx, &granularity);
return area->offset.x % granularity.width == 0 &&
area->offset.y % granularity.height == 0 &&
(area->extent.width % granularity.width == 0 ||
(fb->has_edge_padding &&
area->offset.x + area->extent.width >= fb->width)) &&
(area->extent.height % granularity.height == 0 ||
(fb->has_edge_padding &&
area->offset.y + area->extent.height >= fb->height));
}
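
As a worked example of the check above, here is a hypothetical,
self-contained snippet that assumes the 64x64 granularity from the first
tile_sizes entry; area_is_tile_aligned is an illustrative stand-in, not
the driver function:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool
    area_is_tile_aligned(uint32_t x, uint32_t y, uint32_t w, uint32_t h,
                         uint32_t fb_w, uint32_t fb_h, bool edge_padding)
    {
       const uint32_t tile_w = 64, tile_h = 64;
       return x % tile_w == 0 && y % tile_h == 0 &&
              (w % tile_w == 0 || (edge_padding && x + w >= fb_w)) &&
              (h % tile_h == 0 || (edge_padding && y + h >= fb_h));
    }

    int
    main(void)
    {
       /* A 100x100 render area filling a padded 100x100 framebuffer: the
        * partial edge tiles only spill into padding, so it is tile-aligned.
        */
       assert(area_is_tile_aligned(0, 0, 100, 100, 100, 100, true));

       /* The same area on a framebuffer that aliases a subregion of a
        * larger image has no edge padding, so the partial edge tiles
        * disqualify it.
        */
       assert(!area_is_tile_aligned(0, 0, 100, 100, 100, 100, false));
       return 0;
    }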

View file

@@ -257,6 +257,7 @@ struct v3dv_meta_depth_clear_pipeline {
struct v3dv_meta_blit_pipeline {
VkPipeline pipeline;
VkRenderPass pass;
VkRenderPass pass_no_load;
uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
};
@@ -555,15 +556,22 @@ struct v3dv_render_pass {
struct v3dv_subpass_attachment *subpass_attachments;
};
void v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity);
struct v3dv_framebuffer {
uint32_t width;
uint32_t height;
uint32_t layers;
/* Typically, edge tiles in the framebuffer have padding depending on the
* underlying tiling layout. One consequence of this is that when the
* framebuffer dimensions are not aligned to tile boundaries, tile stores
* would still write full tiles on the edges and write to the padded area.
* If the framebuffer is aliasing a smaller region of a larger image, then
* we need to be careful with this though, as we won't have padding on the
* edge tiles (which typically means that we need to load the tile buffer
* before we store).
*/
bool has_edge_padding;
uint32_t attachment_count;
uint32_t color_attachment_count;
struct v3dv_image_view *attachments[0];
@@ -590,6 +598,10 @@ void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *f
const struct v3dv_subpass *subpass,
uint8_t *max_bpp, bool *msaa);
bool v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
struct v3dv_framebuffer *fb,
struct v3dv_render_pass *pass,
uint32_t subpass_idx);
struct v3dv_cmd_pool {
VkAllocationCallbacks alloc;
struct list_head cmd_buffers;