diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index 229740d424f..14a97cc6539 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -1181,11 +1181,6 @@ emit_clip_window(struct v3dv_job *job, const VkRect2D *rect) } } -/* Checks whether the render area rectangle covers a region that is aligned to - * tile boundaries, which means that for all tiles covered by the render area - * region, there are no uncovered pixels (unless they are also outside the - * framebuffer). - */ static void cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) { @@ -1200,24 +1195,11 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer) * always have framebuffer information available. */ assert(cmd_buffer->state.framebuffer); - - const VkExtent2D fb_extent = { - .width = cmd_buffer->state.framebuffer->width, - .height = cmd_buffer->state.framebuffer->height - }; - - VkExtent2D granularity; - v3dv_subpass_get_granularity(cmd_buffer->state.pass, - cmd_buffer->state.subpass_idx, - &granularity); - cmd_buffer->state.tile_aligned_render_area = - rect->offset.x % granularity.width == 0 && - rect->offset.y % granularity.height == 0 && - (rect->extent.width % granularity.width == 0 || - rect->offset.x + rect->extent.width >= fb_extent.width) && - (rect->extent.height % granularity.height == 0 || - rect->offset.y + rect->extent.height >= fb_extent.height); + v3dv_subpass_area_is_tile_aligned(rect, + cmd_buffer->state.framebuffer, + cmd_buffer->state.pass, + cmd_buffer->state.subpass_idx); if (!cmd_buffer->state.tile_aligned_render_area) { perf_debug("Render area for subpass %d of render pass %p doesn't " @@ -2023,7 +2005,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer) assert(state->subpass_idx < state->pass->subpass_count); const struct v3dv_render_pass *pass = state->pass; const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; - struct 
v3dv_cl *rcl = &job->rcl; /* Comon config must be the first TILE_RENDERING_MODE_CFG and @@ -2031,7 +2012,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer) * updates to the previous HW state. */ const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; - cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { config.image_width_pixels = framebuffer->width; config.image_height_pixels = framebuffer->height; diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index 36d74b8c655..6e0eefb031d 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -1977,6 +1977,8 @@ v3dv_CreateFramebuffer(VkDevice _device, framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; framebuffer->layers = pCreateInfo->layers; + framebuffer->has_edge_padding = true; + framebuffer->attachment_count = pCreateInfo->attachmentCount; framebuffer->color_attachment_count = 0; for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c index fe9e6a0c5c5..0ba96b84257 100644 --- a/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/src/broadcom/vulkan/v3dv_meta_copy.c @@ -61,6 +61,7 @@ v3dv_meta_blit_finish(struct v3dv_device *device) struct v3dv_meta_blit_pipeline *item = entry->data; v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc); v3dv_DestroyRenderPass(_device, item->pass, &device->alloc); + v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc); vk_free(&device->alloc, item); } _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL); @@ -771,7 +772,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, const VkImageBlit *region, - VkFilter filter); + VkFilter filter, + bool dst_is_padded_image); /** * Returns true if the implementation supports the requested operation (even if @@ -998,7 +1000,7 @@ 
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, v3dv_image_from_handle(buffer_image), dst_format, image, src_format, cmask, &cswizzle, - &blit_region, VK_FILTER_NEAREST); + &blit_region, VK_FILTER_NEAREST, false); if (!handled) { /* This is unexpected, we should have a supported blit spec */ unreachable("Unable to blit buffer to destination image"); @@ -1454,7 +1456,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst, format, src, format, 0, NULL, - &blit_region, VK_FILTER_NEAREST); + &blit_region, VK_FILTER_NEAREST, true); /* We should have selected formats that we can blit */ assert(handled); @@ -2693,7 +2695,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, image, dst_format, v3dv_image_from_handle(buffer_image), src_format, cmask, NULL, - &blit_region, VK_FILTER_NEAREST); + &blit_region, VK_FILTER_NEAREST, true); if (!handled) { /* This is unexpected, we should have a supported blit spec */ unreachable("Unable to blit buffer to destination image"); @@ -3101,20 +3103,15 @@ static bool create_blit_render_pass(struct v3dv_device *device, VkFormat dst_format, VkFormat src_format, - VkRenderPass *pass) + VkRenderPass *pass_load, + VkRenderPass *pass_no_load) { const bool is_color_blit = vk_format_is_color(dst_format); - /* FIXME: if blitting to tile boundaries or to the whole image, we could - * use LOAD_DONT_CARE, but then we would have to include that in the - * pipeline hash key. 
Or maybe we should just create both render passes and - * use one or the other at draw time since they would both be compatible - * with the pipeline anyway - */ + /* Attachment load operation is specified below */ VkAttachmentDescription att = { .format = dst_format, .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, .storeOp = VK_ATTACHMENT_STORE_OP_STORE, .initialLayout = VK_IMAGE_LAYOUT_GENERAL, .finalLayout = VK_IMAGE_LAYOUT_GENERAL, @@ -3146,8 +3143,16 @@ create_blit_render_pass(struct v3dv_device *device, .pDependencies = NULL, }; - VkResult result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), - &info, &device->alloc, pass); + VkResult result; + att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), + &info, &device->alloc, pass_load); + if (result != VK_SUCCESS) + return false; + + att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + result = v3dv_CreateRenderPass(v3dv_device_to_handle(device), + &info, &device->alloc, pass_no_load); return result == VK_SUCCESS; } @@ -3763,10 +3768,14 @@ get_blit_pipeline(struct v3dv_device *device, goto fail; ok = create_blit_render_pass(device, dst_format, src_format, - &(*pipeline)->pass); + &(*pipeline)->pass, + &(*pipeline)->pass_no_load); if (!ok) goto fail; + /* Create the pipeline using one of the render passes, they are both + * compatible, so we don't care which one we use here. 
+ */ ok = create_blit_pipeline(device, dst_format, src_format, @@ -3794,6 +3803,8 @@ fail: if (*pipeline) { if ((*pipeline)->pass) v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc); + if ((*pipeline)->pass_no_load) + v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->alloc); if ((*pipeline)->pipeline) v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc); vk_free(&device->alloc, *pipeline); @@ -3896,7 +3907,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkColorComponentFlags cmask, VkComponentMapping *cswizzle, const VkImageBlit *_region, - VkFilter filter) + VkFilter filter, + bool dst_is_padded_image) { bool handled = true; @@ -3907,7 +3919,6 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, !vk_format_is_depth_or_stencil(dst_format)); VkImageBlit region = *_region; - /* Rewrite combined D/S blits to compatible color blits */ if (vk_format_is_depth_or_stencil(dst_format)) { assert(src_format == dst_format); @@ -3940,12 +3951,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; } - if (cmask == 0) { - cmask = VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | - VK_COLOR_COMPONENT_A_BIT; - } + const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT; + if (cmask == 0) + cmask = full_cmask; VkComponentMapping ident_swizzle = { .r = VK_COMPONENT_SWIZZLE_IDENTITY, @@ -4072,7 +4083,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, &pipeline); if (!ok) return handled; - assert(pipeline && pipeline->pipeline && pipeline->pass); + assert(pipeline && pipeline->pipeline && + pipeline->pass && pipeline->pass_no_load); struct v3dv_device *device = cmd_buffer->device; assert(cmd_buffer->meta.blit.dspool); @@ -4128,6 +4140,11 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, if (result != VK_SUCCESS) goto fail; + struct 
v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb); + framebuffer->has_edge_padding = fb_info.width == dst_level_w && + fb_info.height == dst_level_h && + dst_is_padded_image; + v3dv_cmd_buffer_add_private_obj( cmd_buffer, (uintptr_t)fb, (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer); @@ -4208,15 +4225,30 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, }; v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL); + /* If the region we are about to blit is tile-aligned, then we can + * use the render pass version that won't pre-load the tile buffer + * with the dst image contents before the blit. The exception is when we + * don't have a full color mask, since in that case we need to preserve + * the original value of some of the color components. + */ + const VkRect2D render_area = { + .offset = { dst_x, dst_y }, + .extent = { dst_w, dst_h }, + }; + struct v3dv_render_pass *pipeline_pass = + v3dv_render_pass_from_handle(pipeline->pass); + bool can_skip_tlb_load = + cmask == full_cmask && + v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer, + pipeline_pass, 0); + /* Record blit */ VkRenderPassBeginInfo rp_info = { .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .renderPass = pipeline->pass, + .renderPass = can_skip_tlb_load ? 
pipeline->pass_no_load : + pipeline->pass, .framebuffer = fb, - .renderArea = { - .offset = { dst_x, dst_y }, - .extent = { dst_w, dst_h } - }, + .renderArea = render_area, .clearValueCount = 0, }; @@ -4308,7 +4340,7 @@ v3dv_CmdBlitImage(VkCommandBuffer commandBuffer, dst, dst->vk_format, src, src->vk_format, 0, NULL, - &pRegions[i], filter)) { + &pRegions[i], filter, true)) { continue; } unreachable("Unsupported blit operation"); @@ -4469,7 +4501,7 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst, dst->vk_format, src, src->vk_format, 0, NULL, - &blit_region, VK_FILTER_NEAREST); + &blit_region, VK_FILTER_NEAREST, true); } void diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c index 35f9c614289..a030b1c4e1d 100644 --- a/src/broadcom/vulkan/v3dv_pass.c +++ b/src/broadcom/vulkan/v3dv_pass.c @@ -255,10 +255,10 @@ v3dv_DestroyRenderPass(VkDevice _device, vk_free2(&device->alloc, pAllocator, pass); } -void -v3dv_subpass_get_granularity(struct v3dv_render_pass *pass, - uint32_t subpass_idx, - VkExtent2D *granularity) +static void +subpass_get_granularity(struct v3dv_render_pass *pass, + uint32_t subpass_idx, + VkExtent2D *granularity) { static const uint8_t tile_sizes[] = { 64, 64, @@ -321,8 +321,50 @@ v3dv_GetRenderAreaGranularity(VkDevice device, for (uint32_t i = 0; i < pass->subpass_count; i++) { VkExtent2D sg; - v3dv_subpass_get_granularity(pass, i, &sg); + subpass_get_granularity(pass, i, &sg); pGranularity->width = MIN2(pGranularity->width, sg.width); pGranularity->height = MIN2(pGranularity->height, sg.height); } } + +/* Checks whether the render area rectangle covers a region that is aligned to + * tile boundaries. This means that we are writing to all pixels covered by + * all tiles in that area (except for pixels on edge tiles that are outside + * the framebuffer dimensions). 
+ * + * When our framebuffer is aligned to tile boundaries we know we are writing + * valid data to all pixels in each tile and we can apply certain + * optimizations, like avoiding tile loads, since we know that none of the + * original pixel values in each tile for that area need to be preserved. + * We also use this to decide if we can use TLB clears, as these clear whole + * tiles so we can't use them if the render area is not aligned. + * + * Note that when an image is created it will possibly include padding blocks + * depending on its tiling layout. When the framebuffer dimensions are not + * aligned to tile boundaries then edge tiles are only partially covered by the + * framebuffer pixels, but tile stores still seem to store full tiles + * writing to the padded sections. This is important when the framebuffer + * is aliasing a smaller section of a larger image, as in that case the edge + * tiles of the framebuffer would overwrite valid pixels in the larger image. + * In that case, we can't flag the area as being aligned. 
+ */ +bool +v3dv_subpass_area_is_tile_aligned(const VkRect2D *area, + struct v3dv_framebuffer *fb, + struct v3dv_render_pass *pass, + uint32_t subpass_idx) +{ + assert(subpass_idx < pass->subpass_count); + + VkExtent2D granularity; + subpass_get_granularity(pass, subpass_idx, &granularity); + + return area->offset.x % granularity.width == 0 && + area->offset.y % granularity.height == 0 && + (area->extent.width % granularity.width == 0 || + (fb->has_edge_padding && + area->offset.x + area->extent.width >= fb->width)) && + (area->extent.height % granularity.height == 0 || + (fb->has_edge_padding && + area->offset.y + area->extent.height >= fb->height)); +} diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index 0528111148b..caa7ea699b5 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -257,6 +257,7 @@ struct v3dv_meta_depth_clear_pipeline { struct v3dv_meta_blit_pipeline { VkPipeline pipeline; VkRenderPass pass; + VkRenderPass pass_no_load; uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE]; }; @@ -555,15 +556,22 @@ struct v3dv_render_pass { struct v3dv_subpass_attachment *subpass_attachments; }; -void v3dv_subpass_get_granularity(struct v3dv_render_pass *pass, - uint32_t subpass_idx, - VkExtent2D *granularity); - struct v3dv_framebuffer { uint32_t width; uint32_t height; uint32_t layers; + /* Typically, edge tiles in the framebuffer have padding depending on the + * underlying tiling layout. One consequence of this is that when the + * framebuffer dimensions are not aligned to tile boundaries, tile stores + * would still write full tiles on the edges and write to the padded area. + * If the framebuffer is aliasing a smaller region of a larger image, then + * we need to be careful with this though, as we won't have padding on the + * edge tiles (which typically means that we need to load the tile buffer + * before we store). 
+ */ + bool has_edge_padding; + uint32_t attachment_count; uint32_t color_attachment_count; struct v3dv_image_view *attachments[0]; @@ -590,6 +598,10 @@ void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *f const struct v3dv_subpass *subpass, uint8_t *max_bpp, bool *msaa); +bool v3dv_subpass_area_is_tile_aligned(const VkRect2D *area, + struct v3dv_framebuffer *fb, + struct v3dv_render_pass *pass, + uint32_t subpass_idx); struct v3dv_cmd_pool { VkAllocationCallbacks alloc; struct list_head cmd_buffers;