tu: Implement VK_QCOM_fragment_density_map_offset

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33500>
Connor Abbott 2025-02-11 12:40:59 -05:00 committed by Marge Bot
parent 7351f8d587
commit 75178c4655
15 changed files with 490 additions and 132 deletions

View file

@ -694,6 +694,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_EXT_map_memory_placed DONE (anv, nvk, radv, tu)
VK_MESA_image_alignment_control DONE (anv, nvk, radv)
VK_EXT_legacy_dithering DONE (anv, tu, vn)
VK_QCOM_fragment_density_map_offset DONE (tu)
Clover OpenCL 1.0 -- all DONE:

View file

@ -1363,6 +1363,22 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
if (!iview->view.is_mutable)
desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
/* If FDM offset is used, the last row and column extend beyond the
* framebuffer but are shifted over when storing. Expand the width and
* height to account for that.
*/
if (tu_enable_fdm_offset(cmd)) {
uint32_t width = desc[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
uint32_t height = (desc[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
A6XX_TEX_CONST_1_HEIGHT__SHIFT;
width += cmd->state.tiling->tile0.width;
height += cmd->state.tiling->tile0.height;
desc[1] = (desc[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
A6XX_TEX_CONST_1_HEIGHT__MASK)) |
A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
}
desc[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
@ -3910,17 +3926,19 @@ static void
fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_sysmem_clear_coords_state *state =
(const struct apply_sysmem_clear_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@ -4182,17 +4200,19 @@ static void
fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_gmem_clear_coords_state *state =
(const struct apply_gmem_clear_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@ -4816,14 +4836,16 @@ static void
fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_load_coords_state *state =
(const struct apply_load_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
assert(bin.extent.width % frag_area.width == 0);
assert(bin.extent.height % frag_area.height == 0);
@ -4831,10 +4853,10 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
uint32_t scaled_height = bin.extent.height / frag_area.height;
const float coords[] = {
bin.offset.x, bin.offset.y,
bin.offset.x, bin.offset.y,
bin.offset.x + scaled_width, bin.offset.y + scaled_height,
bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
common_bin_offset.x, common_bin_offset.y,
bin.offset.x, bin.offset.y,
common_bin_offset.x + scaled_width, common_bin_offset.y + scaled_height,
bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
};
r3d_coords_raw(cmd, cs, coords);
}
@ -5050,6 +5072,19 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
enum a6xx_format format = fmt.fmt;
fixup_src_format(&src_format, dst_format, &format);
uint32_t src_width = dst_iview->vk.extent.width;
uint32_t src_height = dst_iview->vk.extent.height;
/* With FDM offset, we may blit from an extra row/column of tiles whose
* source coordinates are outside of the attachment. Add an extra tile
* width/height to the size to avoid clipping the source.
*/
if (tu_enable_fdm_offset(cmd)) {
const struct tu_tiling_config *tiling = cmd->state.tiling;
src_width += tiling->tile0.width;
src_height += tiling->tile0.height;
}
tu_cs_emit_regs(cs,
SP_PS_2D_SRC_INFO(CHIP,
.color_format = format,
@ -5063,8 +5098,8 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
.unk22 = 1,
.mutableen = src_iview->view.is_mutable),
SP_PS_2D_SRC_SIZE(CHIP,
.width = dst_iview->vk.extent.width,
.height = dst_iview->vk.extent.height),
.width = src_width,
.height = src_height),
SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
@ -5274,14 +5309,16 @@ static void
fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_store_coords_state *state =
(const struct apply_store_coords_state *)data;
assert(state->view < views);
VkExtent2D frag_area = frag_areas[state->view];
VkRect2D bin = bins[state->view];
/* The bin width/height must be a multiple of the frag_area to make sure
* that the scaling happens correctly. This means there may be some
@ -5299,10 +5336,10 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
.y = bin.offset.y + bin.extent.height - 1));
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
A6XX_GRAS_2D_SRC_TL_X(common_bin_offset.x),
A6XX_GRAS_2D_SRC_BR_X(common_bin_offset.x + scaled_width - 1),
A6XX_GRAS_2D_SRC_TL_Y(common_bin_offset.y),
A6XX_GRAS_2D_SRC_BR_Y(common_bin_offset.y + scaled_height - 1));
}
template <chip CHIP>

View file

@ -700,7 +700,8 @@ tu6_emit_render_cntl<A7XX>(struct tu_cmd_buffer *cmd,
}
static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align,
bool used_by_sysmem)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const VkRect2D *render_area = &cmd->state.render_area;
@ -727,9 +728,42 @@ tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
}
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
/* With FDM offset, bins are shifted to the right in GMEM space compared to
* framebuffer space. We do not use RB_BLIT_SCISSOR_* for loads and stores
* because those do not use the fast path, but we do use it for
* LOAD_OP_CLEAR. Expand the render area so that GMEM clears work
* correctly. We may over-clear but that's ok because the store is clipped
* to the render area.
*/
if (tu_enable_fdm_offset(cmd)) {
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* If this is a generic clear that's also used in sysmem mode then we
* need to emit the unmodified render area in sysmem mode because
* over-clearing is not allowed.
*/
if (used_by_sysmem) {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_GMEM);
}
x2 += tiling->tile0.width;
y2 += tiling->tile0.height;
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
if (used_by_sysmem) {
tu_cond_exec_end(cs);
}
} else {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}
}
void
@ -950,12 +984,20 @@ tu6_update_msaa_disable(struct tu_cmd_buffer *cmd)
}
}
static const struct tu_vsc_config *
tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
{
if (tu_enable_fdm_offset(cmd))
return &tiling->fdm_offset_vsc;
return &tiling->vsc;
}
static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
/* XFB commands are emitted for BINNING || SYSMEM, which makes it
* incompatible with non-hw binning GMEM rendering. this is required because
@ -1014,7 +1056,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
}
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
if (cmd->state.rp.xfb_used && !vsc->binning_possible) {
@ -1059,7 +1101,7 @@ static void
tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t pipe, uint32_t slot, bool skip_wfm)
{
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
if (vsc->binning_possible &&
cmd->state.pass->has_cond_load_store) {
@ -1080,16 +1122,48 @@ struct tu_tile_config {
VkExtent2D frag_areas[MAX_VIEWS];
};
/* For bin offsetting we want to do "Euclidean division," where the remainder
* (i.e. the offset of the bin) is always positive. Unfortunately C/C++
* remainder and division don't do this, so we have to implement it ourselves.
*
* For example, we should have:
*
* euclid_rem(-3, 4) = 1
* euclid_rem(-4, 4) = 0
* euclid_rem(-5, 4) = 3
*/
static int32_t
euclid_rem(int32_t divisor, int32_t divisend)
{
if (divisor >= 0)
return divisor % divisend;
int32_t tmp = divisend - (-divisor % divisend);
return tmp == divisend ? 0 : tmp;
}
/* Calculate how much the bins for a given view should be shifted to the left
* and upwards, given the application-provided FDM offset.
*/
static VkOffset2D
tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling)
{
return (VkOffset2D) {
euclid_rem(-fdm_offset.x, tiling->tile0.width),
euclid_rem(-fdm_offset.y, tiling->tile0.height),
};
}
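
For illustration, a minimal standalone sketch of the bin-offset arithmetic, assuming a 32x32 tile size and application offsets that are multiples of the offset granularity (the function name and values here are hypothetical, not taken from the change):

#include <assert.h>
#include <stdint.h>

/* Same logic as euclid_rem() above, repeated standalone. */
static int32_t
example_euclid_rem(int32_t divisor, int32_t divisend)
{
   if (divisor >= 0)
      return divisor % divisend;
   int32_t tmp = divisend - (-divisor % divisend);
   return tmp == divisend ? 0 : tmp;
}

int main(void)
{
   /* With 32x32 tiles, an FDM offset of (40, 16) slides the bin grid
    * left/up by (euclid_rem(-40, 32), euclid_rem(-16, 32)): only the
    * sub-tile part of the offset is handled by sliding the bins; the
    * whole-tile part is handled later when sampling the density map.
    */
   assert(example_euclid_rem(-40, 32) == 24);
   assert(example_euclid_rem(-16, 32) == 16);
   /* Whole-tile offsets leave the bin grid where it started. */
   assert(example_euclid_rem(-32, 32) == 0);
   assert(example_euclid_rem(-64, 32) == 0);
   return 0;
}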
template <chip CHIP>
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm)
bool fdm, const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
bool hw_binning = use_hw_binning(cmd);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
@ -1118,6 +1192,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
const uint32_t x1 = tiling->tile0.width * tile->pos.x;
const uint32_t y1 = tiling->tile0.height * tile->pos.y;
const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
@ -1161,11 +1236,29 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
{ x1, y1 },
{ (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
};
VkRect2D bins[views];
for (unsigned i = 0; i < views; i++) {
if (!fdm_offsets || cmd->state.rp.shared_viewport) {
bins[i] = bin;
continue;
}
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
bins[i].offset.x = MAX2(0, (int32_t)x1 - bin_offset.x);
bins[i].offset.y = MAX2(0, (int32_t)y1 - bin_offset.y);
bins[i].extent.width =
MAX2(MIN2((int32_t)x1 + bin.extent.width - bin_offset.x, MAX_VIEWPORT_SIZE) - bins[i].offset.x, 0);
bins[i].extent.height =
MAX2(MIN2((int32_t)y1 + bin.extent.height - bin_offset.y, MAX_VIEWPORT_SIZE) - bins[i].offset.y, 0);
}
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint, patch) {
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, bin, views, tile->frag_areas);
patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 }, views,
tile->frag_areas, bins);
}
/* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@ -1252,7 +1345,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
const struct tu_render_pass *pass = cmd->state.pass;
const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
if (pass->has_fdm)
tu_cs_set_writeable(cs, true);
@ -1261,7 +1354,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
A6XX_CP_SET_MARKER_0_USES_GMEM);
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
struct tu_resolve_group resolve_group = {};
@ -1646,13 +1739,31 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_sanity_check(cs);
}
bool
tu_enable_fdm_offset(struct tu_cmd_buffer *cmd)
{
if (!cmd->state.pass)
return false;
if (!cmd->state.pass->has_fdm)
return false;
unsigned fdm_a = cmd->state.pass->fragment_density_map.attachment;
if (fdm_a == VK_ATTACHMENT_UNUSED)
return TU_DEBUG(FDM_OFFSET);
const struct tu_image_view *fdm = cmd->state.attachments[fdm_a];
return fdm->image->vk.create_flags &
VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM;
}
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t num_vsc_pipes)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
@ -1680,7 +1791,7 @@ static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
const uint32_t used_pipe_count =
vsc->pipe_count.width * vsc->pipe_count.height;
@ -1711,36 +1822,70 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
template <chip CHIP>
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* If this command buffer may be executed multiple times, then
* viewports/scissor states may have been changed by previous executions
* and we need to reset them before executing the binning IB.
* and we need to reset them before executing the binning IB. With FDM
* offset the viewport also needs to be transformed during the binning
* phase.
*/
if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
cmd->fdm_bin_patchpoints.size != 0) {
if ((!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) ||
fdm_offsets) && cmd->fdm_bin_patchpoints.size != 0) {
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
VkExtent2D unscaled_frag_areas[num_views];
for (unsigned i = 0; i < num_views; i++)
VkRect2D bins[num_views];
for (unsigned i = 0; i < num_views; i++) {
unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
if (fdm_offsets && !cmd->state.rp.shared_viewport) {
/* We need to shift over the viewport and scissor during the
* binning pass to match the shift applied when rendering. The way
* to do this is to make the per-view bin start negative. In the
* actual rendering pass the per-view bin start is also shifted in
* the negative direction, but there the first bin is clipped so its
* start never actually goes negative; here we keep it negative to
* avoid clipping the user scissor to a non-zero common bin start. We
* skip patching load/store below in order to avoid patching loads
* and stores to a crazy negative-offset bin. The parts of the
* framebuffer left or above the origin correspond to the
* non-visible parts of the left or top bins that will be
* discarded. The framebuffer still needs to extend to the
* original bottom and right, to avoid incorrectly clipping the
* user scissor, so we need to add to the width and height to
* compensate.
*/
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
bins[i] = {
{ -bin_offset.x, -bin_offset.y },
{ fb->width + bin_offset.x, fb->height + bin_offset.y },
};
} else {
bins[i] = { { 0, 0 }, { fb->width, fb->height } };
}
}
util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
struct tu_fdm_bin_patchpoint, patch) {
if (patch->flags & TU_FDM_SKIP_BINNING)
continue;
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, bin, num_views, unscaled_frag_areas);
patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, num_views,
unscaled_frag_areas, bins);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
uint32_t width = fb->width + (fdm_offsets ? tiling->tile0.width : 0);
uint32_t height = fb->height + (fdm_offsets ? tiling->tile0.height : 0);
tu6_emit_window_scissor(cs, 0, 0, width - 1, height - 1);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
@ -1929,6 +2074,22 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
if (!iview->view.is_mutable)
dst[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
/* If FDM offset is used, the last row and column extend beyond the
* framebuffer but are shifted over when storing. Expand the width and
* height to account for that.
*/
if (tu_enable_fdm_offset(cmd)) {
uint32_t width = dst[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
uint32_t height = (dst[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
A6XX_TEX_CONST_1_HEIGHT__SHIFT;
width += cmd->state.tiling->tile0.width;
height += cmd->state.tiling->tile0.height;
dst[1] = (dst[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
A6XX_TEX_CONST_1_HEIGHT__MASK)) |
A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
}
dst[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
@ -2177,11 +2338,12 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
template <chip CHIP>
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
struct tu_renderpass_result *autotune_result,
const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
tu_lrz_tiling_begin<CHIP>(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@ -2225,7 +2387,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, cs, true);
tu6_emit_binning_pass<CHIP>(cmd, cs);
tu6_emit_binning_pass<CHIP>(cmd, cs, fdm_offsets);
if (CHIP == A6XX) {
tu_cs_emit_regs(cs,
@ -2270,9 +2432,9 @@ template <chip CHIP>
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_tile_config *tile,
bool fdm)
bool fdm, const VkOffset2D *fdm_offsets)
{
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm);
tu6_emit_tile_select<CHIP>(cmd, &cmd->cs, tile, fdm, fdm_offsets);
tu_lrz_before_tile<CHIP>(cmd, &cmd->cs);
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
@ -2338,7 +2500,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
static void
tu_calc_frag_area(struct tu_cmd_buffer *cmd,
struct tu_tile_config *tile,
const struct tu_image_view *fdm)
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const uint32_t x1 = tiling->tile0.width * tile->pos.x;
@ -2351,11 +2514,71 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
const struct tu_framebuffer *fb = cmd->state.framebuffer;
struct tu_frag_area raw_areas[views];
if (fdm) {
tu_fragment_density_map_sample(fdm,
(x1 + MIN2(x2, fb->width)) / 2,
(y1 + MIN2(y2, fb->height)) / 2,
fb->width, fb->height, views,
raw_areas);
for (unsigned i = 0; i < views; i++) {
VkOffset2D sample_pos = { 0, 0 };
/* Offsets less than a tile size are accomplished by sliding the
* tiles. However once we shift a whole tile size then we reset the
* tiles back to where they were at the beginning and we need to
* adjust where each bin is sampling from:
*
* x offset = 0:
*
* ------------------------------------
* | * | * | * | (unused) |
* ------------------------------------
*
* x offset = 4:
*
* -------------------------
* | * | * | * | * |
* -------------------------
*
* x offset = 8:
*
* ------------------------------------
* | * | * | * | (unused) |
* ------------------------------------
*
* As the user's offset increases we slide the tiles to the right,
* until we reach the whole tile size and reset the tile positions.
* tu_bin_offset() returns an amount to shift to the left, negating
* the offset.
*
* If we were forced to use a shared viewport, then we must not shift
* over the tiles and instead must only shift when sampling because
* we cannot shift the tiles differently per view. This disables
* smooth transitions of the fragment density map and effectively
* negates the extension.
*
* Note that we cannot clamp x2/y2 to the framebuffer size, as we
* normally would do, because then tiles along the edge would
* incorrectly nudge the sample_pos towards the center of the
* framebuffer. If we shift one complete tile over towards the
* center and reset the tiles as above, the sample_pos would
* then shift back towards the edge and we could get a "pop" from
* suddenly changing density due to the slight shift.
*/
if (fdm_offsets) {
VkOffset2D offset = fdm_offsets[i];
if (!cmd->state.rp.shared_viewport) {
VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
offset.x += bin_offset.x;
offset.y += bin_offset.y;
}
sample_pos.x = (x1 + x2) / 2 - offset.x;
sample_pos.y = (y1 + y2) / 2 - offset.y;
} else {
sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2;
sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2;
}
tu_fragment_density_map_sample(fdm,
sample_pos.x,
sample_pos.y,
fb->width, fb->height, i,
&raw_areas[i]);
}
} else {
for (unsigned i = 0; i < views; i++)
raw_areas[i].width = raw_areas[i].height = 1.0f;
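
As a worked example with illustrative numbers: with 32-wide tiles and a per-view x offset of 8, tu_bin_offset() gives a bin_offset.x of 24, so a bin whose unshifted range is [64, 96) is shifted to cover [40, 72); its center sits at 56, and subtracting the application offset of 8 places the sample at 48, which matches what the code computes as (x1 + x2) / 2 - (8 + 24) = 80 - 32 = 48.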
@ -2388,10 +2611,24 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
width = 1u << util_logbase2(width);
height = 1u << util_logbase2(height);
/* When FDM offset is enabled, the fragment area has to divide the
* offset to make sure that we don't have tiles with partial fragments.
* It would be bad to have the fragment area change as a function of the
* offset, because we'd get "popping" as the resolution changes with the
* offset, so just make sure it divides the offset granularity. This
* should mean it always divides the offset for any possible offset.
*/
if (fdm_offsets) {
width = MIN2(width, TU_FDM_OFFSET_GRANULARITY);
height = MIN2(height, TU_FDM_OFFSET_GRANULARITY);
}
/* Make sure that the width/height divides the tile width/height so
* we don't have to do extra awkward clamping of the edges of each
* bin when resolving. Note that because the tile width is rounded to
* a multiple of 32 any power of two 32 or less will work.
* bin when resolving. It also has to divide the fdm offset, if any.
* Note that because the tile width is rounded to a multiple of 32 any
* power of two 32 or less will work, and if there is an offset then it
* must be a multiple of 4 so 2 or 4 will definitely work.
*
* TODO: Try to take advantage of the total area allowance here, too.
*/
@ -2486,7 +2723,8 @@ template <chip CHIP>
void
tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
const struct tu_image_view *fdm)
const struct tu_image_view *fdm,
const VkOffset2D *fdm_offsets)
{
uint32_t width = tx2 - tx1;
uint32_t height = ty2 - ty1;
@ -2505,7 +2743,7 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
tile->extent = { 1, 1 };
tile->pipe = pipe;
tile->slot_mask = 1u << (width * y + x);
tu_calc_frag_area(cmd, tile, fdm);
tu_calc_frag_area(cmd, tile, fdm, fdm_offsets);
}
}
@ -2549,7 +2787,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
if (merged_tiles & (1u << tile_idx))
continue;
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx], true);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tiles[tile_idx],
true, fdm_offsets);
}
}
}
@ -2557,10 +2796,11 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
template <chip CHIP>
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result)
struct tu_renderpass_result *autotune_result,
const VkOffset2D *fdm_offsets)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_vsc_config *vsc = &tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
const struct tu_image_view *fdm = NULL;
if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
@ -2571,6 +2811,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
cmd->device->physical_device->info->a6xx.has_bin_mask;
/* If not using FDM make sure not to accidentally apply the offsets */
if (!has_fdm)
fdm_offsets = NULL;
/* Create gmem stores now (at EndRenderPass time) because they needed to
* know whether to allow their conditional execution, which was tied to a
* state that was known only at the end of the renderpass. They will be
@ -2582,7 +2826,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result);
tu6_tile_render_begin<CHIP>(cmd, &cmd->cs, autotune_result, fdm_offsets);
/* Note: we reverse the order of walking the pipes and tiles on every
* other row, to improve texture cache locality compared to raster order.
@ -2602,7 +2846,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height);
if (merge_tiles) {
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
tu_render_pipe_fdm<CHIP>(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
fdm_offsets);
continue;
}
@ -2623,9 +2868,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
.extent = { 1, 1 },
};
if (has_fdm)
tu_calc_frag_area(cmd, &tile, fdm);
tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm);
tu6_render_tile<CHIP>(cmd, &cmd->cs, &tile, has_fdm,
fdm_offsets);
}
slot_row += tile_row_stride;
}
@ -2676,7 +2922,8 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
template <chip CHIP>
void
tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
const VkOffset2D *fdm_offsets)
{
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr<CHIP>(cmd_buffer);
@ -2685,7 +2932,7 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem<CHIP>(cmd_buffer, autotune_result);
else
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result);
tu_cmd_render_tiles<CHIP>(cmd_buffer, autotune_result, fdm_offsets);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4771,7 +5018,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
*/
tu_restore_suspended_pass(cmd, cmd);
TU_CALLX(cmd->device, tu_cmd_render)(cmd);
TU_CALLX(cmd->device, tu_cmd_render)(cmd, NULL);
if (cmd->state.suspend_resume == SR_IN_CHAIN)
cmd->state.suspend_resume = SR_NONE;
else
@ -4877,7 +5124,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
{
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* If we might choose to bin, then put the loads under a check for geometry
* having been binned to this tile. If we don't choose to bin in the end,
@ -4902,7 +5149,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
emitted_scissor = true;
}
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
@ -4918,7 +5165,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
&cmd->state.pass->attachments[i];
if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, false);
tu6_emit_blit_scissor(cmd, cs, false, false);
emitted_scissor = true;
}
tu_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, i);
@ -4969,7 +5216,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resol
&cmd->state.pass->attachments[i];
if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
if (!emitted_scissor) {
tu6_emit_blit_scissor(cmd, cs, false);
tu6_emit_blit_scissor(cmd, cs, false, true);
emitted_scissor = true;
}
tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
@ -5432,7 +5679,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
if (subpass->resolve_attachments) {
tu6_emit_blit_scissor(cmd, cs, true);
tu6_emit_blit_scissor(cmd, cs, true, false);
struct tu_resolve_group resolve_group = {};
@ -5908,9 +6155,10 @@ static void
fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas)
const VkExtent2D *frag_areas,
const VkRect2D *bins)
{
const struct apply_fs_params_state *state =
(const struct apply_fs_params_state *)data;
@ -5919,7 +6167,8 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
for (unsigned i = 0; i < num_consts; i++) {
assert(i < views);
VkExtent2D area = frag_areas[i];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin);
VkRect2D bin = bins[i];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
@ -7443,9 +7692,25 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
return;
}
const VkSubpassFragmentDensityMapOffsetEndInfoQCOM *fdm_offset_info =
vk_find_struct_const(pSubpassEndInfo->pNext,
SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM);
const VkOffset2D *fdm_offsets =
(fdm_offset_info && fdm_offset_info->fragmentDensityOffsetCount > 0) ?
fdm_offset_info->pFragmentDensityOffsets : NULL;
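
For reference, a minimal sketch of how an application could chain these offsets into vkCmdEndRenderPass2 (cmd_buf and the (64, 64) offset are illustrative; offsets must be multiples of the reported fragmentDensityOffsetGranularity, and the FDM image must have been created with VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM):

#include <vulkan/vulkan.h>

void end_pass_with_fdm_offset(VkCommandBuffer cmd_buf)
{
   /* One offset per layer/view; (64, 64) is a multiple of the (8, 8)
    * granularity reported by the driver. */
   static const VkOffset2D offsets[] = { { 64, 64 } };

   const VkSubpassFragmentDensityMapOffsetEndInfoQCOM offset_info = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM,
      .fragmentDensityOffsetCount = 1,
      .pFragmentDensityOffsets = offsets,
   };
   const VkSubpassEndInfo end_info = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
      .pNext = &offset_info,
   };

   vkCmdEndRenderPass2(cmd_buf, &end_info);
}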
VkOffset2D test_offsets[MAX_VIEWS];
if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
for (unsigned i = 0;
i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
test_offsets[i] = { 64, 64 };
}
fdm_offsets = test_offsets;
}
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
cmd_buffer->state.cache.pending_flush_bits |=
cmd_buffer->state.renderpass_cache.pending_flush_bits;
@ -7483,7 +7748,16 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer)
*/
tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
} else {
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
VkOffset2D test_offsets[MAX_VIEWS];
const VkOffset2D *fdm_offsets = NULL;
if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
for (unsigned i = 0;
i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
test_offsets[i] = { 64, 64 };
}
fdm_offsets = test_offsets;
}
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
}
tu_reset_render_pass(cmd_buffer);

View file

@ -695,7 +695,7 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void tu_cmd_render(struct tu_cmd_buffer *cmd, const VkOffset2D *fdm_offsets);
void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
uint32_t x, uint32_t y, uint32_t z);
@ -748,12 +748,15 @@ void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
bool tu_enable_fdm_offset(struct tu_cmd_buffer *cmd);
typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
void *data,
VkRect2D bin,
VkOffset2D common_bin_offset,
unsigned views,
const VkExtent2D *frag_areas);
const VkExtent2D *frag_areas,
const VkRect2D *bins);
enum tu_fdm_flags {
TU_FDM_NONE = 0,
@ -807,13 +810,15 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
*/
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
VkExtent2D unscaled_frag_areas[num_views];
VkRect2D bins[num_views];
for (unsigned i = 0; i < num_views; i++) {
unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
}
apply(cmd, cs, state, (VkRect2D) {
bins[i] = (VkRect2D) {
{ 0, 0 },
{ MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
}, num_views, unscaled_frag_areas);
};
}
apply(cmd, cs, state, (VkOffset2D) {0, 0}, num_views, unscaled_frag_areas, bins);
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
util_dynarray_append(&cmd->fdm_bin_patchpoints,

View file

@ -138,6 +138,18 @@
#define MAX_FDM_TEXEL_SIZE_LOG2 10
#define MAX_FDM_TEXEL_SIZE (1u << MAX_FDM_TEXEL_SIZE_LOG2)
/* This granularity is arbitrary, but there are two competing concerns here:
*
* - The fragment area has to always divide the offset, and we don't want the
* fragment area changing with the offset, so we have to clamp the fragment
* area to this granularity. Therefore larger granularities lead to lower
* minimum resolution.
* - The larger the offset granularity, the choppier the motion is.
*
* Choose 8 as a compromise between the two.
*/
#define TU_FDM_OFFSET_GRANULARITY 8
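
With this granularity, any per-view offset component is a multiple of 8, so clamping the per-bin fragment area to at most 8x8 (as done in tu_calc_frag_area()) is enough to guarantee the area always divides the offset; the tradeoff is that the fragment size cannot grow past 8x8 pixels while an offset may be in use.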
#define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
#define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

View file

@ -333,6 +333,7 @@ get_device_extensions(const struct tu_physical_device *device,
.GOOGLE_user_type = true,
.IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.QCOM_fragment_density_map_offset = true,
.VALVE_mutable_descriptor_type = true,
} };
@ -747,6 +748,9 @@ tu_get_features(struct tu_physical_device *pdevice,
/* VK_KHR_subgroup_rotate */
features->shaderSubgroupRotate = true;
features->shaderSubgroupRotateClustered = true;
/* VK_QCOM_fragment_density_map_offset */
features->fragmentDensityMapOffset = true;
}
static void
@ -1385,6 +1389,11 @@ tu_get_properties(struct tu_physical_device *pdevice,
props->degenerateLinesRasterized = false;
props->fullyCoveredFragmentShaderInputVariable = false;
props->conservativeRasterizationPostDepthCoverage = false;
/* VK_QCOM_fragment_density_map_offset */
props->fragmentDensityOffsetGranularity = (VkExtent2D) {
TU_FDM_OFFSET_GRANULARITY, TU_FDM_OFFSET_GRANULARITY
};
}
static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {

View file

@ -488,7 +488,7 @@ struct tu_tiling_config {
/* Whether using GMEM is even possible with this configuration */
bool possible;
struct tu_vsc_config vsc;
struct tu_vsc_config vsc, fdm_offset_vsc;
};
struct tu_framebuffer

View file

@ -152,7 +152,7 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev,
old_cmds[i]->pre_chain.trace_renderpass_end);
}
TU_CALLX(dev, tu_cmd_render)(cmd_buffer);
TU_CALLX(dev, tu_cmd_render)(cmd_buffer, NULL);
tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(&cmd_buffer->cs,

View file

@ -1163,10 +1163,10 @@ tu_DestroyImageView(VkDevice _device,
*/
void
tu_fragment_density_map_sample(const struct tu_image_view *fdm,
uint32_t x, uint32_t y,
int32_t x, int32_t y,
uint32_t width, uint32_t height,
uint32_t layers,
struct tu_frag_area *areas)
uint32_t layer,
struct tu_frag_area *area)
{
assert(fdm->image->layout[0].tile_mode == TILE6_LINEAR);
@ -1176,20 +1176,19 @@ tu_fragment_density_map_sample(const struct tu_image_view *fdm,
fdm_shift_x = CLAMP(fdm_shift_x, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
fdm_shift_y = CLAMP(fdm_shift_y, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
uint32_t i = x >> fdm_shift_x;
uint32_t j = y >> fdm_shift_y;
int32_t i = x >> fdm_shift_x;
int32_t j = y >> fdm_shift_y;
i = CLAMP(i, 0, fdm->vk.extent.width - 1);
j = CLAMP(j, 0, fdm->vk.extent.height - 1);
unsigned cpp = fdm->image->layout[0].cpp;
unsigned pitch = fdm->view.pitch;
void *pixel = (char *)fdm->image->map + fdm->view.offset + cpp * i + pitch * j;
for (unsigned i = 0; i < layers; i++) {
float density_src[4], density[4];
util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
pipe_swizzle_4f(density, density_src, fdm->swizzle);
areas[i].width = 1.0f / density[0];
areas[i].height = 1.0f / density[1];
pixel = (char *)pixel + fdm->view.layer_size;
}
void *pixel = (char *)fdm->image->map + fdm->view.offset + fdm->view.layer_size * layer + cpp * i + pitch * j;
float density_src[4], density[4];
util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
pipe_swizzle_4f(density, density_src, fdm->swizzle);
area->width = 1.0f / density[0];
area->height = 1.0f / density[1];
}

View file

@ -129,9 +129,9 @@ struct tu_frag_area {
void
tu_fragment_density_map_sample(const struct tu_image_view *fdm,
uint32_t x, uint32_t y,
int32_t x, int32_t y,
uint32_t width, uint32_t height,
uint32_t layers, struct tu_frag_area *areas);
uint32_t layer, struct tu_frag_area *area);
VkResult
tu_image_update_layout(struct tu_device *device, struct tu_image *image,

View file

@ -157,7 +157,7 @@ tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct A6XX_GRAS_LRZ_CNTL cntl)
{
if (CHIP >= A7XX) {
// A7XX split LRZ_CNTL into two seperate registers.
/* A7XX split LRZ_CNTL into two separate registers. */
struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
.disable_on_wrong_dir = cntl.disable_on_wrong_dir,
.fc_enable = cntl.fc_enable,

View file

@ -2548,44 +2548,49 @@ struct apply_viewport_state {
bool share_scale;
};
/* It's a hardware restriction that the window offset (i.e. bin.offset) must
* be the same for all views. This means that GMEM coordinates cannot be a
* simple scaling of framebuffer coordinates, because this would require us to
* scale the window offset and the scale may be different per view. Instead we
* have to apply a per-bin offset to the GMEM coordinate transform to make
* sure that the window offset maps to itself. Specifically we need an offset
* o to the transform:
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
* must be the same for all views. This means that GMEM coordinates cannot be
* a simple scaling of framebuffer coordinates, because this would require us
* to scale the window offset and the scale may be different per view. Instead
* we have to apply a per-bin offset to the GMEM coordinate transform to make
* sure that the window offset maps to the per-view bin coordinate, which will
* be the same if there is no offset. Specifically we need an offset o to the
* transform:
*
* x' = s * x + o
*
* so that when we plug in the bin start b_s:
* so that when we plug in the per-view bin start b_s and the common window
* offset b_cs:
*
* b_s = s * b_s + o
* b_cs = s * b_s + o
*
* and we get:
*
* o = b_s - s * b_s
* o = b_cs - s * b_s
*
* We use this form exactly, because we know the bin offset is a multiple of
* We use this form exactly, because we know the bin start is a multiple of
* the frag area so s * b_s is an integer and we can compute an exact result
* easily.
* easily. We also have to make sure that the bin offset is a multiple of the
* frag area by restricting the frag area.
*/
VkOffset2D
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
VkOffset2D common_bin_offset)
{
assert(bin.offset.x % frag_area.width == 0);
assert(bin.offset.y % frag_area.height == 0);
return (VkOffset2D) {
bin.offset.x - bin.offset.x / frag_area.width,
bin.offset.y - bin.offset.y / frag_area.height
common_bin_offset.x - bin.offset.x / frag_area.width,
common_bin_offset.y - bin.offset.y / frag_area.height
};
}
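
A worked example with illustrative numbers: with 32x32 tiles, a per-view frag_area of 2x2, and an application x offset of 8, tu_bin_offset() gives 24, so a bin whose common window offset is b_cs = 64 has a per-view start of b_s = 64 - 24 = 40. The per-bin offset is then o = 64 - 40 / 2 = 44, and the transform maps the per-view bin start back onto the window offset: 40 / 2 + 44 = 64.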
static void
fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
VkOffset2D common_bin_offset, unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@ -2603,9 +2608,12 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
* replicate it across all viewports.
*/
VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
VkRect2D bin = state->share_scale ? bins[0] : bins[i];
VkViewport viewport =
state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
if (frag_area.width == 1 && frag_area.height == 1) {
if (frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) {
vp.viewports[i] = viewport;
continue;
}
@ -2618,7 +2626,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
vp.viewports[i].width = viewport.width * scale_x;
vp.viewports[i].height = viewport.height * scale_y;
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
vp.viewports[i].x = scale_x * viewport.x + offset.x;
vp.viewports[i].y = scale_y * viewport.y + offset.y;
@ -2694,7 +2703,8 @@ tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
static void
fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
VkOffset2D common_bin_offset, unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@ -2703,12 +2713,9 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
for (unsigned i = 0; i < vp.scissor_count; i++) {
VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
VkRect2D bin = state->share_scale ? bins[0] : bins[i];
VkRect2D scissor =
state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
if (frag_area.width == 1 && frag_area.height == 1) {
vp.scissors[i] = scissor;
continue;
}
/* Transform the scissor following the viewport. It's unclear how this
* is supposed to handle cases where the scissor isn't aligned to the
@ -2716,7 +2723,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
* fragments if the scissor size equals the framebuffer size and it
* isn't aligned to the fragment area.
*/
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
VkOffset2D min = {
scissor.offset.x / frag_area.width + offset.x,
scissor.offset.y / frag_area.height + offset.y,
@ -2731,12 +2739,12 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
*/
uint32_t scaled_width = bin.extent.width / frag_area.width;
uint32_t scaled_height = bin.extent.height / frag_area.height;
vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
vp.scissors[i].offset.x = MAX2(min.x, common_bin_offset.x);
vp.scissors[i].offset.y = MAX2(min.y, common_bin_offset.y);
vp.scissors[i].extent.width =
MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
MIN2(max.x, common_bin_offset.x + scaled_width) - vp.scissors[i].offset.x;
vp.scissors[i].extent.height =
MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
MIN2(max.y, common_bin_offset.y + scaled_height) - vp.scissors[i].offset.y;
}
TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);

View file

@ -243,7 +243,8 @@ TU_DECL_PIPELINE_DOWNCAST(graphics, TU_PIPELINE_GRAPHICS)
TU_DECL_PIPELINE_DOWNCAST(graphics_lib, TU_PIPELINE_GRAPHICS_LIB)
TU_DECL_PIPELINE_DOWNCAST(compute, TU_PIPELINE_COMPUTE)
VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin);
VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
VkOffset2D common_bin_offset);
template <chip CHIP>
uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd);

View file

@ -49,6 +49,7 @@ static const struct debug_control tu_debug_options[] = {
{ "dumpas", TU_DEBUG_DUMPAS },
{ "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
{ "perfcraw", TU_DEBUG_PERFCRAW },
{ "fdmoffset", TU_DEBUG_FDM_OFFSET },
{ NULL, 0 }
};
@ -454,6 +455,16 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
tu_tiling_config_update_pipe_layout(vsc, device, pass->has_fdm);
tu_tiling_config_update_pipes(vsc, device);
tu_tiling_config_update_binning(vsc, device);
if (pass->has_fdm) {
struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
fdm_offset_vsc->tile_count = (VkExtent2D) {
vsc->tile_count.width + 1, vsc->tile_count.height + 1
};
tu_tiling_config_update_pipe_layout(fdm_offset_vsc, device, true);
tu_tiling_config_update_pipes(fdm_offset_vsc, device);
tu_tiling_config_update_binning(fdm_offset_vsc, device);
}
}
}
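
The separate fdm_offset_vsc configuration reserves one extra row and column of tiles: when the bin grid slides by a sub-tile amount, the last row and column extend past the framebuffer edge, so binning has to account for the additional partially covered tiles.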

View file

@ -69,6 +69,7 @@ enum tu_debug_flags : uint64_t
TU_DEBUG_DUMPAS = BITFIELD64_BIT(28),
TU_DEBUG_NO_BIN_MERGING = BITFIELD64_BIT(29),
TU_DEBUG_PERFCRAW = BITFIELD64_BIT(30),
TU_DEBUG_FDM_OFFSET = BITFIELD64_BIT(31),
};
struct tu_env {