tu: Implement VK_QCOM_subpass_shader_resolve

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38451>
Connor Abbott 2025-05-05 13:37:14 -04:00 committed by Marge Bot
parent 7691f1b70d
commit ad84ae2719
14 changed files with 403 additions and 88 deletions


@@ -763,6 +763,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_const_ir3:
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3:
case nir_intrinsic_bindless_resource_ir3:
case nir_intrinsic_ray_intersection_ir3:
case nir_intrinsic_load_attribute_payload_intel:


@@ -1517,6 +1517,11 @@ intrinsic("load_frag_size_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
intrinsic("load_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
# Per-view GMEM FragCoord scale and offset.
intrinsic("load_gmem_frag_scale_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
intrinsic("load_gmem_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
# IR3-specific load/store intrinsics. These access a buffer used to pass data
# between geometry stages - perhaps it's explicit access to the vertex cache.
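For context on the new load_gmem_frag_scale_ir3/load_gmem_frag_offset_ir3
intrinsics added above: together with the values turnip uploads (see the
fdm_apply_fs_params() hunk further down), they remap the unscaled,
framebuffer-space FragCoord back into the scaled GMEM rendering space,
per view:

    gmem_frag_scale   = (1 / frag_area.width, 1 / frag_area.height)
    gmem_frag_offset  = per-bin offset
    FragCoord_gmem.xy = FragCoord.xy * gmem_frag_scale + gmem_frag_offset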


@@ -3113,26 +3113,42 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
break;
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3: {
unsigned param =
intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3: {
unsigned param;
switch (intr->intrinsic) {
case nir_intrinsic_load_frag_size_ir3:
param = IR3_DP_FS(frag_size);
break;
case nir_intrinsic_load_frag_offset_ir3:
param = IR3_DP_FS(frag_offset);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param = IR3_DP_FS(gmem_frag_scale);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param = IR3_DP_FS(gmem_frag_offset);
break;
default:
UNREACHABLE("bad intrinsic");
}
if (nir_src_is_const(intr->src[0])) {
uint32_t view = nir_src_as_uint(intr->src[0]);
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, param + 4 * view + i);
dst[i] = create_driver_param(ctx, param + 8 * view + i);
}
create_rpt = true;
} else {
struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param_indirect(ctx, param + i,
ir3_get_addr0(ctx, view, 4));
ir3_get_addr0(ctx, view, 8));
}
ctx->so->constlen =
MAX2(ctx->so->constlen,
const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 +
param / 4 + nir_intrinsic_range(intr));
param / 4 + nir_intrinsic_range(intr) * 2);
}
break;
}


@@ -1434,11 +1434,19 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
break;
case nir_intrinsic_load_frag_size_ir3:
param_info->offset = IR3_DP_FS(frag_size);
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_frag_offset_ir3:
param_info->offset = IR3_DP_FS(frag_offset);
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param_info->offset = IR3_DP_FS(gmem_frag_scale);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param_info->offset = IR3_DP_FS(gmem_frag_offset);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_frag_invocation_count:
param_info->offset = IR3_DP_FS(frag_invocation_count);


@@ -107,7 +107,11 @@ struct ir3_driver_params_fs {
uint32_t frag_size;
uint32_t __pad_09;
uint32_t frag_offset;
uint32_t __pad_11_12[2];
uint32_t __pad_11;
uint32_t gmem_frag_scale;
uint32_t __pad_13;
uint32_t gmem_frag_offset;
uint32_t __pad_15;
};
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
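A minimal sketch of the addressing this layout implies, mirroring the
"param + 8 * view + i" logic in the emit_intrinsic() hunk above;
fs_driver_param_dword() is a hypothetical helper, not part of the tree:

    #include <stdint.h>

    /* Each view's copy of the FS driver params occupies 8 dwords (four
     * two-component params: frag_size, frag_offset, gmem_frag_scale and
     * gmem_frag_offset), so view N's copy of a param starts 8 * N dwords
     * after IR3_DP_FS(param). */
    static inline uint32_t
    fs_driver_param_dword(uint32_t param_base, uint32_t view, uint32_t comp)
    {
       return param_base + 8 * view + comp;
    }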


@@ -4043,6 +4043,7 @@ struct apply_sysmem_clear_coords_state {
unsigned layer;
float z_clear_val;
VkRect2D rect;
bool custom_resolve;
};
static void
@@ -4053,7 +4054,8 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_sysmem_clear_coords_state *state =
(const struct apply_sysmem_clear_coords_state *)data;
@@ -4064,9 +4066,15 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
hw_viewport_offsets[MIN2(state->view, views - 1)];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
if (state->custom_resolve && !binning) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) { 1, 1 };
}
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
frag_area.width) + offset.x;
@@ -4251,6 +4259,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
.layer = rects[i].baseArrayLayer + layer,
.z_clear_val = z_clear_val,
.rect = rects[i].rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 4, TU_FDM_NONE,
fdm_apply_sysmem_clear_coords,
@@ -4323,6 +4332,7 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct apply_gmem_clear_coords_state {
unsigned view;
VkRect2D rect;
bool custom_resolve;
};
static void
@@ -4333,7 +4343,8 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_gmem_clear_coords_state *state =
(const struct apply_gmem_clear_coords_state *)data;
@@ -4343,6 +4354,11 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
if (state->custom_resolve) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) { 1, 1 };
}
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
frag_area.width) + offset.x - 1;
@@ -4491,6 +4507,7 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
const VkClearRect *pRects)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
/* sysmem path behaves like a draw, note we don't have a way of using different
* flushes for sysmem/gmem, so this needs to be outside of the cond_exec
@@ -4504,8 +4521,11 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
*
* Similarly, we also use the 3D path when in a secondary command buffer that
* doesn't know the GMEM layout that will be chosen by the primary.
*
* Don't use the GMEM path if we are in a custom resolve.
*/
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT ||
subpass->custom_resolve) {
tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
return;
}
@@ -4514,7 +4534,6 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
* binning time, then emit the clear as a 3D draw so that it contributes to
* that visibility.
*/
const struct tu_subpass *subpass = cmd->state.subpass;
for (uint32_t i = 0; i < attachmentCount; i++) {
uint32_t a;
if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
@@ -4561,6 +4580,7 @@ tu7_clear_attachment_generic_single_rect(
struct apply_gmem_clear_coords_state state = {
.view = 0,
.rect = rect->rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
fdm_apply_gmem_clear_coords, state);
@@ -4589,6 +4609,7 @@ tu7_clear_attachment_generic_single_rect(
struct apply_gmem_clear_coords_state state = {
.view = layer,
.rect = rect->rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
fdm_apply_gmem_clear_coords, state);
@@ -4970,7 +4991,8 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_load_coords_state *state =
(const struct apply_load_coords_state *)data;
@@ -5435,6 +5457,8 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
subpass->color_attachments[j].attachment;
if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a))
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
if (subpass->custom_resolve)
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
}
}
@@ -5454,7 +5478,8 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_store_coords_state *state =
(const struct apply_store_coords_state *)data;


@@ -1412,6 +1412,64 @@ tu_fdm_num_layers(const struct tu_cmd_buffer *cmd)
(cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1);
}
template <chip CHIP>
static void
tu6_emit_bin_size_gmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum a6xx_buffers_location buffers_location,
bool disable_lrz)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
bool hw_binning = use_hw_binning(cmd);
tu6_emit_bin_size<CHIP>(
cs, buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.width : 0,
buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.height : 0,
{
.render_mode = RENDERING_PASS,
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
.buffers_location = buffers_location,
.lrz_feedback_zmode_mask =
phys_dev->info->props.has_lrz_feedback
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
: LRZ_FEEDBACK_NONE,
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
});
}
/* Set always-identical registers used specifically for GMEM */
template <chip CHIP>
static void
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
}
/* Set always-identical registers used specifically for sysmem */
template <chip CHIP>
static void
tu7_emit_sysmem_render_begin_regs(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
.z_sysmem = true,
.s_sysmem = true,
.rt0_sysmem = true,
.rt1_sysmem = true,
.rt2_sysmem = true,
.rt3_sysmem = true,
.rt4_sysmem = true,
.rt5_sysmem = true,
.rt6_sysmem = true,
.rt7_sysmem = true,
));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
}
template <chip CHIP>
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
@@ -1419,7 +1477,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
const struct tu_tile_config *tile,
bool fdm, const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
@@ -1470,19 +1527,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
if (fdm_offsets && (tile->pos.x == 0 || tile->pos.y == 0))
disable_lrz = true;
tu6_emit_bin_size<CHIP>(
cs, tiling->tile0.width, tiling->tile0.height,
{
.render_mode = RENDERING_PASS,
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
.buffers_location = BUFFERS_IN_GMEM,
.lrz_feedback_zmode_mask =
phys_dev->info->props.has_lrz_feedback && !bin_is_scaled
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
: LRZ_FEEDBACK_NONE,
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
});
/* When using custom resolve we need to re-emit these regs as they are
* overwritten when switching to sysmem.
*/
if (CHIP >= A7XX &&
cmd->state.pass->subpasses[cmd->state.pass->subpass_count - 1].custom_resolve) {
tu7_emit_tile_render_begin_regs<CHIP>(cs);
}
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_GMEM, disable_lrz);
tu_cs_emit_regs(cs,
A6XX_VFD_RENDER_MODE(RENDERING_PASS));
@@ -1634,7 +1687,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 },
frag_offsets, views, tile->frag_areas, bins);
frag_offsets, views, tile->frag_areas, bins, false);
}
/* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@@ -2111,15 +2164,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
tu_cs_emit(cs, A6XX_SP_VS_CONST_CONFIG_CONSTLEN(8) | A6XX_SP_VS_CONST_CONFIG_ENABLED);
}
/* Set always-identical registers used specifically for GMEM */
template <chip CHIP>
static void
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
}
/* Emit the bin restore preamble, which runs in between bins when L1
* preemption with skipsaverestore happens and we switch back to this context.
* We need to restore static registers normally programmed at cmdbuf start
@@ -2435,7 +2479,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, frag_offsets,
num_views, unscaled_frag_areas, bins);
num_views, unscaled_frag_areas, bins, true);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
@@ -2532,6 +2576,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
bool gmem)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
uint32_t layers = MAX2(cmd->state.framebuffer->layers,
cmd->state.pass->num_views);
/* note: we can probably emit input attachments just once for the whole
* renderpass, this would avoid emitting both sysmem/gmem versions
@@ -2621,7 +2667,11 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
}
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) {
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem ||
/* Skip GMEM patching when tiling is impossible as we may get
* assertion failures from register packing below.
*/
!tiling->possible) {
memcpy(&texture.map[i * A6XX_TEX_CONST_DWORDS], dst, sizeof(dst));
continue;
}
@@ -2647,10 +2697,17 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
dst[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
/* Note: it seems the HW implicitly calculates the array pitch with the
* GMEM tiling, so we don't need to specify the pitch ourselves.
/* Note: it seems the HW implicitly calculates the array pitch, except
* when rendering to sysmem (i.e. in a custom resolve subpass). We only
* guarantee the pitch is valid when there is more than 1 layer, so skip
* emitting it otherwise to avoid asserts.
*/
dst[3] = 0;
if (layers > 1) {
dst[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(tiling->tile0.width *
tiling->tile0.height * cpp);
} else {
dst[3] = 0;
}
dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
dst[5] &= A6XX_TEX_CONST_5_DEPTH__MASK;
for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
@@ -2985,20 +3042,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
});
if (CHIP == A7XX) {
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
.z_sysmem = true,
.s_sysmem = true,
.rt0_sysmem = true,
.rt1_sysmem = true,
.rt2_sysmem = true,
.rt3_sysmem = true,
.rt4_sysmem = true,
.rt5_sysmem = true,
.rt6_sysmem = true,
.rt7_sysmem = true,
));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
}
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
@@ -6400,9 +6444,14 @@ static void
tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
{
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_subpass *subpass = cmd->state.subpass;
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* Shader resolve subpasses don't use GMEM */
if (subpass->custom_resolve)
return;
/* If we might choose to bin, then put the loads under a check for geometry
* having been binned to this tile. If we don't choose to bin in the end,
* then we will have manually set those registers to say geometry is present.
@@ -6496,9 +6545,11 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
return;
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_subpass *subpass = cmd->state.subpass;
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
if (!subpass->custom_resolve)
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
tu6_emit_sysmem_unresolves<CHIP>(cmd, cs, cmd->state.subpass);
@@ -6508,7 +6559,8 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
tu_clear_sysmem_attachment<CHIP>(cmd, cs, i);
}
tu_cond_exec_end(cs); /* sysmem */
if (!subpass->custom_resolve)
tu_cond_exec_end(cs); /* sysmem */
}
static void
@@ -6584,6 +6636,84 @@ tu7_emit_subpass_shading_rate(struct tu_cmd_buffer *cmd,
cmd->prev_fsr_is_null = false;
}
/* If this is a shader resolve subpass, switch to writing to sysmem.
*/
template <chip CHIP>
static void
tu_emit_subpass_custom_resolve(struct tu_cmd_buffer *cmd)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
if (!subpass->custom_resolve)
return;
/* Since a7xx, buffer location can be controlled per-buffer. We also have
* to update the steering register so that generic clears use sysmem.
*/
if (CHIP >= A7XX) {
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
/* Disable foveation offset here. It's not necessary for custom resolve.
*/
tu_cs_emit_regs(cs, GRAS_BIN_FOVEAT(CHIP));
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
} else {
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
/* On a6xx the location is set in *_BIN_CONTROL */
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_SYSMEM, false);
tu_cond_exec_end(cs);
}
/* With FDM and non-subsampled images, we switch from rendering space to
* framebuffer space in the custom resolve subpass when not in the binning
* pass because we are writing directly to the user-visible attachment. We
* already aren't relying on the window scissor whenever FDM is enabled,
* but it can get in the way if FDM offset is being used because it is
* specified in rendering space, so the origin is shifted to the right and
* down compared to the framebuffer-space bin coordinates and part of the
* bin gets incorrectly clipped. Just disable it here by setting it to the
* entire framebuffer. Add an extra tile size for when we are in the
* binning pass and still using rendering space.
*/
if (tu_enable_fdm_offset(cmd)) {
tu6_emit_window_scissor<CHIP>(cs, 0, 0,
fb->width + tiling->tile0.width - 1,
fb->height + tiling->tile0.height - 1);
}
/* If FDM is enabled, we need to re-emit all FDM-related state. */
if (cmd->state.pass->fragment_density_map.attachment !=
VK_ATTACHMENT_UNUSED) {
cmd->state.dirty |= TU_CMD_DIRTY_FDM;
}
}
/* If the last subpass is a shader resolve pass, emit flushes after switching
* to sysmem, similar to fixed-function 3D resolves. Our flushing code assumes
that the CCU isn't in use while in GMEM mode, so we have to flush it ourselves.
*/
template<chip CHIP>
static void
tu_emit_custom_resolve_end(struct tu_cmd_buffer *cmd)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
if (!subpass->custom_resolve)
return;
if (subpass->color_count)
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED)
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
}
/* emit loads, clears, and mrt/zs/msaa/ubwc state for the subpass that is
* starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
*
@@ -6599,6 +6729,7 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
struct tu_resolve_group resolve_group = {};
tu_emit_subpass_custom_resolve<CHIP>(cmd);
tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
tu_emit_subpass_begin_sysmem<CHIP>(cmd);
if (cmd->device->physical_device->info->props.has_generic_clear) {
@@ -7582,6 +7713,7 @@ fs_params_size(struct tu_cmd_buffer *cmd)
struct apply_fs_params_state {
unsigned num_consts;
bool custom_resolve;
};
static void
@@ -7592,13 +7724,14 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_fs_params_state *state =
(const struct apply_fs_params_state *)data;
unsigned num_consts = state->num_consts;
for (unsigned i = 0; i < num_consts; i++) {
for (unsigned i = 0; i < DIV_ROUND_UP(num_consts, 2); i++) {
/* FDM per layer may be enabled in the shader but not in the renderpass,
* in which case views will be 1 and we have to replicate the one view
* to all of the layers.
@@ -7607,10 +7740,38 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
VkRect2D bin = bins[MIN2(i, views - 1)];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
/* For custom resolve, we switch to rendering directly to sysmem and so
* the fragment size becomes 1x1. This means we have to scale down
* FragCoord when accessing GMEM input attachments.
*
* TODO: When we support subsampled images, this should also only happen
* for non-subsampled images.
*/
if (state->custom_resolve) {
tu_cs_emit(cs, 1 /* width */);
tu_cs_emit(cs, 1 /* height */);
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
} else {
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
}
if (i * 2 + 1 < num_consts) {
if (state->custom_resolve) {
tu_cs_emit(cs, fui(1. / area.width));
tu_cs_emit(cs, fui(1. / area.height));
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
} else {
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
}
}
}
}
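A worked example with assumed values: for a bin whose fragment area is 2x2
and whose per-bin offset is (-32.0, 0.0), the loop above emits, per view:

    normal subpass:  (2, 2, -32.0, 0.0)  then  (1.0, 1.0, 0.0, 0.0)
    custom resolve:  (1, 1, 0.0, 0.0)    then  (0.5, 0.5, -32.0, 0.0)

so in the custom resolve case the shader reconstructs the GMEM coordinate
as FragCoord.xy * (0.5, 0.5) + (-32.0, 0.0).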
@@ -7632,16 +7793,23 @@ tu_emit_fdm_params(struct tu_cmd_buffer *cmd,
if (fs->fs.has_fdm) {
struct apply_fs_params_state state = {
.num_consts = num_units - 1,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 4 * (num_units - 1),
TU_FDM_SKIP_BINNING,
fdm_apply_fs_params, state);
} else {
for (unsigned i = 1; i < num_units; i++) {
for (unsigned i = 0; i < DIV_ROUND_UP((num_units - 1), 2); i++) {
tu_cs_emit(cs, 1);
tu_cs_emit(cs, 1);
tu_cs_emit(cs, fui(0.0f));
tu_cs_emit(cs, fui(0.0f));
if (i * 2 + 1 < num_units - 1) {
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
}
}
}
}
@@ -9174,6 +9342,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
fdm_offsets = test_offsets;
}
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
@@ -9202,6 +9372,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
*/
TU_CALLX(cmd_buffer->device, tu_lrz_flush_valid_during_renderpass)
(cmd_buffer, &cmd_buffer->draw_cs);
} else {
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
}
const VkRenderPassFragmentDensityMapOffsetEndInfoEXT *fdm_offset_info =


@@ -858,7 +858,8 @@ typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins);
const VkRect2D *bins,
bool binning);
enum tu_fdm_flags {
TU_FDM_NONE = 0,
@@ -926,7 +927,7 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
};
hw_viewport_offsets[i] = (VkOffset2D) { 0, 0 };
}
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins);
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins, false);
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
util_dynarray_append(&cmd->fdm_bin_patchpoints, patch);
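The new trailing bool tells an apply callback whether it is running for the
binning pass: this series passes true only from tu6_emit_binning_pass() and
false both here at patchpoint-creation time and when patching per-tile. A
minimal callback skeleton under that assumption (my_apply and its body are
illustrative, not part of the tree):

    static void
    my_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
             VkOffset2D common_bin_offset,
             const VkOffset2D *hw_viewport_offsets,
             unsigned views, const VkExtent2D *frag_areas,
             const VkRect2D *bins, bool binning)
    {
       /* Custom-resolve subpasses render in framebuffer space except during
        * binning, so only apply the FDM bin transform when binning is true. */
       if (!binning) {
          /* emit unscaled, framebuffer-space values */
       }
    }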


@@ -350,6 +350,7 @@ get_device_extensions(const struct tu_physical_device *device,
.IMG_filter_cubic = device->info->props.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.QCOM_fragment_density_map_offset = true,
.QCOM_render_pass_shader_resolve = true,
.VALVE_fragment_density_map_layered = true,
.VALVE_mutable_descriptor_type = true,
} };


@@ -969,7 +969,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
struct tu_subpass *subpass = &pass->subpasses[i];
struct tu_render_pass_attachment *att = &pass->attachments[a];
att->gmem = true;
if (!subpass->custom_resolve)
att->gmem = true;
update_samples(subpass, att->samples);
att->used_views |= subpass->multiview_mask;
@@ -1182,6 +1183,8 @@ tu_CreateRenderPass2(VkDevice _device,
subpass->srgb_cntl = 0;
subpass->legacy_dithering_enabled = desc->flags &
VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT;
subpass->custom_resolve = desc->flags &
VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM;
const BITMASK_ENUM(VkSubpassDescriptionFlagBits) raster_order_access_bits =
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT |


@@ -82,6 +82,8 @@ struct tu_subpass
bool depth_used;
bool stencil_used;
bool custom_resolve;
VkSampleCountFlagBits samples;
uint32_t srgb_cntl;


@@ -1773,6 +1773,12 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
~attachments_referenced;
}
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
keys[MESA_SHADER_FRAGMENT].custom_resolve =
builder->graphics_state.rp->custom_resolve;
}
if (builder->create_flags &
VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
for (unsigned i = 0; i < builder->num_libraries; i++) {
@@ -2578,6 +2584,7 @@ struct apply_viewport_state {
bool share_scale;
/* See tu_pipeline::fake_single_viewport */
bool fake_single_viewport;
bool custom_resolve;
};
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
@@ -2624,7 +2631,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@@ -2653,9 +2661,16 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
*/
VkViewport viewport =
state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i];
if (frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) {
if ((frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) ||
/* When in a custom resolve operation (TODO: and using
* non-subsampled images) we switch to framebuffer coordinates so we
* shouldn't apply the transform. However the binning pass isn't
* aware of this, so we have to keep applying the transform for
* binning.
*/
(state->custom_resolve && !binning)) {
vp.viewports[i] = viewport;
continue;
}
@@ -2692,6 +2707,7 @@ tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.viewport_count = num_views;
@@ -2753,7 +2769,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@@ -2781,6 +2798,19 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
/* Disable scaling and offset when doing a custom resolve to a
* non-subsampled image and not in the binning pass, because we
* use framebuffer coordinates.
*
* TODO: When we support subsampled images, only do this for
* non-subsampled images.
*/
if (state->custom_resolve && !binning) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) {1, 1};
}
VkOffset2D min = {
scissor.offset.x / frag_area.width + offset.x,
scissor.offset.y / frag_area.height + offset.y,
@@ -2791,12 +2821,20 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
};
/* Intersect scissor with the scaled bin, this essentially replaces the
* window scissor.
* window scissor. With custom resolve (TODO: and non-subsampled images)
* we have to use the unscaled bin instead.
*/
uint32_t scaled_width = bin.extent.width / frag_area.width;
uint32_t scaled_height = bin.extent.height / frag_area.height;
uint32_t bin_x = common_bin_offset.x - hw_viewport_offset.x;
uint32_t bin_y = common_bin_offset.y - hw_viewport_offset.y;
int32_t bin_x;
int32_t bin_y;
if (state->custom_resolve && !binning) {
bin_x = bin.offset.x;
bin_y = bin.offset.y;
} else {
bin_x = common_bin_offset.x - hw_viewport_offset.x;
bin_y = common_bin_offset.y - hw_viewport_offset.y;
}
vp.scissors[i].offset.x = MAX2(min.x, bin_x);
vp.scissors[i].offset.y = MAX2(min.y, bin_y);
vp.scissors[i].extent.width =
@@ -2818,6 +2856,7 @@ tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.scissor_count = num_views;
@@ -4426,6 +4465,8 @@ tu_fill_render_pass_state(struct vk_render_pass_state *rp,
rp->color_attachment_formats[i] = pass->attachments[a].format;
rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
}
rp->custom_resolve = subpass->custom_resolve;
}
static void


@@ -643,20 +643,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
return true;
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3: {
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3: {
if (!dev->compiler->load_shader_consts_via_preamble)
return false;
unsigned param =
instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
unsigned param;
switch (instr->intrinsic) {
case nir_intrinsic_load_frag_size_ir3:
param = IR3_DP_FS(frag_size);
break;
case nir_intrinsic_load_frag_offset_ir3:
param = IR3_DP_FS(frag_offset);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param = IR3_DP_FS(gmem_frag_scale);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param = IR3_DP_FS(gmem_frag_offset);
break;
default:
UNREACHABLE("bad intrinsic");
}
unsigned offset = param - IR3_DP_FS_DYNAMIC;
unsigned base = param - IR3_DP_FS_DYNAMIC;
nir_def *view = instr->src[0].ssa;
nir_def *offset = nir_imul_imm(b, view, 2);
nir_def *result =
ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
offset, view, nir_intrinsic_range(instr));
base, offset, nir_intrinsic_range(instr) * 2);
nir_def_replace(&instr->def, result);
return true;
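With load_shader_consts_via_preamble, each view's record in the FDM driver
UBO now spans two vec4s, hence the nir_imul_imm(b, view, 2) and the doubled
range above. A hypothetical C view of one record, matching the dwords
written by fdm_apply_fs_params() in tu_cmd_buffer.cc (field names are
illustrative):

    struct fdm_view_record {
       uint32_t frag_area_w,   frag_area_h;   /* vec4 0: load_frag_size_ir3 */
       float    frag_offset_x, frag_offset_y; /*   load_frag_offset_ir3 */
       float    gmem_scale_x,  gmem_scale_y;  /* vec4 1: load_gmem_frag_scale_ir3 */
       float    gmem_offset_x, gmem_offset_y; /*   load_gmem_frag_offset_ir3 */
    };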
@@ -1147,6 +1164,7 @@ struct lower_fdm_options {
unsigned num_views;
bool adjust_fragcoord;
bool use_layer;
bool adjust_gmem_fragcoord;
};
static bool
@@ -1211,7 +1229,22 @@ lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
}
if (intrin->intrinsic == nir_intrinsic_load_frag_coord_gmem_ir3) {
return nir_load_frag_coord_unscaled_ir3(b);
nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
if (!options->adjust_gmem_fragcoord)
return unscaled_coord;
nir_def *frag_offset =
nir_load_gmem_frag_offset_ir3(b, view, .range = options->num_views);
nir_def *frag_scale =
nir_load_gmem_frag_scale_ir3(b, view, .range = options->num_views);
nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
xy = nir_fadd(b, nir_fmul(b, xy, frag_scale), frag_offset);
return nir_vec4(b,
nir_channel(b, xy, 0),
nir_channel(b, xy, 1),
nir_channel(b, unscaled_coord, 2),
nir_channel(b, unscaled_coord, 3));
}
assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
@@ -2802,6 +2835,7 @@ tu_shader_create(struct tu_device *dev,
key->max_fdm_layers, 1),
.adjust_fragcoord = key->fragment_density_map,
.use_layer = !key->multiview_mask,
.adjust_gmem_fragcoord = key->fragment_density_map && key->custom_resolve,
};
NIR_PASS(_, nir, tu_nir_lower_fdm, &fdm_options);


@@ -128,6 +128,7 @@ struct tu_shader_key {
bool robust_storage_access2;
bool robust_uniform_access2;
bool lower_view_index_to_device_index;
bool custom_resolve;
enum ir3_wavesize_option api_wavesize, real_wavesize;
};