diff --git a/docs/features.txt b/docs/features.txt
index 4500c5b078e..796a09a2ca7 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -694,6 +694,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_EXT_map_memory_placed                              DONE (anv, nvk, radv, tu)
   VK_MESA_image_alignment_control                       DONE (anv, nvk, radv)
   VK_EXT_legacy_dithering                               DONE (anv, tu, vn)
+  VK_QCOM_fragment_density_map_offset                   DONE (tu)

Clover OpenCL 1.0 -- all DONE:

diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index a3c94f570aa..1222a4d7233 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -1363,6 +1363,22 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
    if (!iview->view.is_mutable)
       desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+
+   /* If FDM offset is used, the last row and column extend beyond the
+    * framebuffer but are shifted over when storing. Expand the width and
+    * height to account for that.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      uint32_t width = desc[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
+      uint32_t height = (desc[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
+                        A6XX_TEX_CONST_1_HEIGHT__SHIFT;
+      width += cmd->state.tiling->tile0.width;
+      height += cmd->state.tiling->tile0.height;
+      desc[1] = (desc[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
+                             A6XX_TEX_CONST_1_HEIGHT__MASK)) |
+                A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
+   }
+
    desc[2] =
       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
@@ -3910,17 +3926,19 @@ static void
 fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
                               struct tu_cs *cs,
                               void *data,
-                              VkRect2D bin,
+                              VkOffset2D common_bin_offset,
                               unsigned views,
-                              const VkExtent2D *frag_areas)
+                              const VkExtent2D *frag_areas,
+                              const VkRect2D *bins)
 {
    const struct apply_sysmem_clear_coords_state *state =
       (const struct apply_sysmem_clear_coords_state *)data;

    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

-   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);

    unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
    unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@@ -4182,17 +4200,19 @@ static void
 fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             void *data,
-                            VkRect2D bin,
+                            VkOffset2D common_bin_offset,
                             unsigned views,
-                            const VkExtent2D *frag_areas)
+                            const VkExtent2D *frag_areas,
+                            const VkRect2D *bins)
 {
    const struct apply_gmem_clear_coords_state *state =
       (const struct apply_gmem_clear_coords_state *)data;

    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

-   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);

    unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
    unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@@ -4816,14 +4836,16 @@ static void
 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       void *data,
-                      VkRect2D bin,
+                      VkOffset2D common_bin_offset,
                       unsigned views,
-                      const VkExtent2D *frag_areas)
+                      const VkExtent2D *frag_areas,
+                      const VkRect2D *bins)
 {
    const struct apply_load_coords_state *state =
       (const struct apply_load_coords_state *)data;
    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

    assert(bin.extent.width % frag_area.width == 0);
    assert(bin.extent.height % frag_area.height == 0);
@@ -4831,10 +4853,10 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
    uint32_t scaled_height = bin.extent.height / frag_area.height;

    const float coords[] = {
-      bin.offset.x, bin.offset.y,
-      bin.offset.x, bin.offset.y,
-      bin.offset.x + scaled_width, bin.offset.y + scaled_height,
-      bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
+      common_bin_offset.x, common_bin_offset.y,
+      bin.offset.x, bin.offset.y,
+      common_bin_offset.x + scaled_width, common_bin_offset.y + scaled_height,
+      bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
    };
    r3d_coords_raw(cmd, cs, coords);
 }
@@ -5050,6 +5072,19 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
    enum a6xx_format format = fmt.fmt;
    fixup_src_format(&src_format, dst_format, &format);

+   uint32_t src_width = dst_iview->vk.extent.width;
+   uint32_t src_height = dst_iview->vk.extent.height;
+
+   /* With FDM offset, we may blit from an extra row/column of tiles whose
+    * source coordinates are outside of the attachment. Add an extra tile
+    * width/height to the size to avoid clipping the source.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      const struct tu_tiling_config *tiling = cmd->state.tiling;
+      src_width += tiling->tile0.width;
+      src_height += tiling->tile0.height;
+   }
+
    tu_cs_emit_regs(cs,
                    SP_PS_2D_SRC_INFO(CHIP,
                       .color_format = format,
@@ -5063,8 +5098,8 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
                       .unk22 = 1,
                       .mutableen = src_iview->view.is_mutable),
                    SP_PS_2D_SRC_SIZE(CHIP,
-                      .width = dst_iview->vk.extent.width,
-                      .height = dst_iview->vk.extent.height),
+                      .width = src_width,
+                      .height = src_height),
                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
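The load and store patchpoints above and below share one idea: the GMEM-side rectangle starts at the common window offset and is downscaled by the fragment area, while the framebuffer-side rectangle covers the full per-view bin. A worked example with made-up numbers (nothing here is taken from the patch):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   /* Hypothetical bin: per-view start 192, extent 256, common window
    * offset 256, fragment area 2 (in x; y behaves the same way). */
   uint32_t bin_x = 192, bin_w = 256;
   uint32_t common_x = 256;
   uint32_t frag_w = 2;

   /* The bin extent must be a multiple of the fragment area. */
   assert(bin_w % frag_w == 0);
   uint32_t scaled_w = bin_w / frag_w; /* 128 */

   /* GMEM-side rect: [common_x, common_x + scaled_w) = [256, 384).
    * Framebuffer-side rect: [bin_x, bin_x + bin_w) = [192, 448). */
   assert(common_x + scaled_w == 384);
   assert(bin_x + bin_w == 448);
   return 0;
}
```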
@@ -5274,14 +5309,16 @@ static void
 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        void *data,
-                       VkRect2D bin,
+                       VkOffset2D common_bin_offset,
                        unsigned views,
-                       const VkExtent2D *frag_areas)
+                       const VkExtent2D *frag_areas,
+                       const VkRect2D *bins)
 {
    const struct apply_store_coords_state *state =
       (const struct apply_store_coords_state *)data;
    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

    /* The bin width/height must be a multiple of the frag_area to make sure
     * that the scaling happens correctly. This means there may be some
@@ -5299,10 +5336,10 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                    A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
                                        .y = bin.offset.y + bin.extent.height - 1));
    tu_cs_emit_regs(cs,
-                   A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
-                   A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
-                   A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
-                   A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
+                   A6XX_GRAS_2D_SRC_TL_X(common_bin_offset.x),
+                   A6XX_GRAS_2D_SRC_BR_X(common_bin_offset.x + scaled_width - 1),
+                   A6XX_GRAS_2D_SRC_TL_Y(common_bin_offset.y),
+                   A6XX_GRAS_2D_SRC_BR_Y(common_bin_offset.y + scaled_height - 1));
 }

 template <chip CHIP>
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 573475d558a..9b16cb2f71b 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -700,7 +700,8 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
 }

 static void
-tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
+tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align,
+                      bool used_by_sysmem)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const VkRect2D *render_area = &cmd->state.render_area;
@@ -727,9 +728,42 @@ tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
       y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
    }

-   tu_cs_emit_regs(cs,
-                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
-                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+   /* With FDM offset, bins are shifted to the right in GMEM space compared to
+    * framebuffer space. We do not use RB_BLIT_SCISSOR_* for loads and stores
+    * because those do not use the fast path, but we do use it for
+    * LOAD_OP_CLEAR. Expand the render area so that GMEM clears work
+    * correctly. We may over-clear but that's ok because the store is clipped
+    * to the render area.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      const struct tu_tiling_config *tiling = cmd->state.tiling;
+
+      /* If this is a generic clear that's also used in sysmem mode then we
+       * need to emit the unmodified render area in sysmem mode because
+       * over-clearing is not allowed.
+       */
+      if (used_by_sysmem) {
+         tu_cs_emit_regs(cs,
+                         A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                         A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                                CP_COND_REG_EXEC_0_GMEM);
+      }
+
+      x2 += tiling->tile0.width;
+      y2 += tiling->tile0.height;
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                      A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+
+      if (used_by_sysmem) {
+         tu_cond_exec_end(cs);
+      }
+   } else {
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                      A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+   }
 }

 void
@@ -950,12 +984,20 @@ tu6_update_msaa_disable(struct tu_cmd_buffer *cmd)
    }
 }

+static const struct tu_vsc_config *
+tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
+{
+   if (tu_enable_fdm_offset(cmd))
+      return &tiling->fdm_offset_vsc;
+   return &tiling->vsc;
+}
+
 static bool
 use_hw_binning(struct tu_cmd_buffer *cmd)
 {
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    /* XFB commands are emitted for BINNING || SYSMEM, which makes it
     * incompatible with non-hw binning GMEM rendering. This is required because
@@ -1014,7 +1056,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
       return true;
    }

-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
    if (cmd->state.rp.xfb_used && !vsc->binning_possible) {
@@ -1059,7 +1101,7 @@ static void
 tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                               uint32_t pipe, uint32_t slot, bool skip_wfm)
 {
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    if (vsc->binning_possible &&
        cmd->state.pass->has_cond_load_store) {
@@ -1080,16 +1122,48 @@ struct tu_tile_config {
    VkExtent2D frag_areas[MAX_VIEWS];
 };

+/* For bin offsetting we want to do "Euclidean division," where the remainder
+ * (i.e. the offset of the bin) is always non-negative. Unfortunately C/C++
+ * remainder and division don't do this, so we have to implement it ourselves.
+ *
+ * For example, we should have:
+ *
+ *    euclid_rem(-3, 4) = 1
+ *    euclid_rem(-4, 4) = 0
+ *    euclid_rem(-5, 4) = 3
+ */
+
+static int32_t
+euclid_rem(int32_t dividend, int32_t divisor)
+{
+   if (dividend >= 0)
+      return dividend % divisor;
+   int32_t tmp = divisor - (-dividend % divisor);
+   return tmp == divisor ? 0 : tmp;
+}
+
+/* Calculate how much the bins for a given view should be shifted to the left
+ * and upwards, given the application-provided FDM offset.
+ */
+static VkOffset2D
+tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling)
+{
+   return (VkOffset2D) {
+      euclid_rem(-fdm_offset.x, tiling->tile0.width),
+      euclid_rem(-fdm_offset.y, tiling->tile0.height),
+   };
+}
+
 template <chip CHIP>
 static void
 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      const struct tu_tile_config *tile,
-                     bool fdm)
+                     bool fdm, const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
    bool hw_binning = use_hw_binning(cmd);

    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
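A quick standalone check of euclid_rem() and the bin-offset arithmetic it feeds (hypothetical test harness, not part of the patch; the 256-wide tile is illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* Same definition as in the patch. */
static int32_t
euclid_rem(int32_t dividend, int32_t divisor)
{
   if (dividend >= 0)
      return dividend % divisor;
   int32_t tmp = divisor - (-dividend % divisor);
   return tmp == divisor ? 0 : tmp;
}

int
main(void)
{
   /* The documented examples. */
   assert(euclid_rem(-3, 4) == 1);
   assert(euclid_rem(-4, 4) == 0);
   assert(euclid_rem(-5, 4) == 3);

   /* tu_bin_offset() with an assumed tile0.width of 256: a user FDM offset
    * of 64 shifts the bins 192 to the left, which is equivalent to shifting
    * the content 64 to the right; a whole-tile offset resets to zero. */
   assert(euclid_rem(-64, 256) == 192);
   assert(euclid_rem(-256, 256) == 0);
   return 0;
}
```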
@@ -1118,6 +1192,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,

    const uint32_t x1 = tiling->tile0.width * tile->pos.x;
    const uint32_t y1 = tiling->tile0.height * tile->pos.y;
+
    const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
    const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
    tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
@@ -1161,11 +1236,29 @@
          { x1, y1 },
          { (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
       };
+      VkRect2D bins[views];
+      for (unsigned i = 0; i < views; i++) {
+         if (!fdm_offsets || cmd->state.rp.shared_viewport) {
+            bins[i] = bin;
+            continue;
+         }
+
+         VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+
+         bins[i].offset.x = MAX2(0, (int32_t)x1 - bin_offset.x);
+         bins[i].offset.y = MAX2(0, (int32_t)y1 - bin_offset.y);
+         bins[i].extent.width =
+            MAX2(MIN2((int32_t)x1 + bin.extent.width - bin_offset.x,
+                      MAX_VIEWPORT_SIZE) - bins[i].offset.x, 0);
+         bins[i].extent.height =
+            MAX2(MIN2((int32_t)y1 + bin.extent.height - bin_offset.y,
+                      MAX_VIEWPORT_SIZE) - bins[i].offset.y, 0);
+      }
+
       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
          tu_cs_emit_qw(cs, patch->iova);
-         patch->apply(cmd, cs, patch->data, bin, views, tile->frag_areas);
+         patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 }, views,
+                      tile->frag_areas, bins);
       }

       /* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@@ -1252,7 +1345,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    const struct tu_render_pass *pass = cmd->state.pass;
    const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    if (pass->has_fdm)
       tu_cs_set_writeable(cs, true);
@@ -1261,7 +1354,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
                   A6XX_CP_SET_MARKER_0_USES_GMEM);

-   tu6_emit_blit_scissor(cmd, cs, true);
+   tu6_emit_blit_scissor(cmd, cs, true, false);

    struct tu_resolve_group resolve_group = {};

@@ -1646,13 +1739,31 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_sanity_check(cs);
 }

+bool
+tu_enable_fdm_offset(struct tu_cmd_buffer *cmd)
+{
+   if (!cmd->state.pass)
+      return false;
+
+   if (!cmd->state.pass->has_fdm)
+      return false;
+
+   unsigned fdm_a = cmd->state.pass->fragment_density_map.attachment;
+   if (fdm_a == VK_ATTACHMENT_UNUSED)
+      return TU_DEBUG(FDM_OFFSET);
+
+   const struct tu_image_view *fdm = cmd->state.attachments[fdm_a];
+   return fdm->image->vk.create_flags &
+      VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM;
+}
+
 static void
 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                 uint32_t num_vsc_pipes)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    tu_cs_emit_regs(cs,
                    A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
@@ -1680,7 +1791,7 @@ static void
 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    const uint32_t used_pipe_count =
       vsc->pipe_count.width * vsc->pipe_count.height;
@@ -1711,36 +1822,70 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)

 template <chip CHIP>
 static void
-tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                      const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_tiling_config *tiling = cmd->state.tiling;

    /* If this command buffer may be executed multiple times, then
     * viewports/scissor states may have been changed by previous executions
-    * and we need to reset them before executing the binning IB.
+    * and we need to reset them before executing the binning IB. With FDM
+    * offset the viewport also needs to be transformed during the binning
+    * phase.
     */
-   if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
-       cmd->fdm_bin_patchpoints.size != 0) {
+   if ((!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) ||
+        fdm_offsets) && cmd->fdm_bin_patchpoints.size != 0) {
       unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
       VkExtent2D unscaled_frag_areas[num_views];
-      for (unsigned i = 0; i < num_views; i++)
+      VkRect2D bins[num_views];
+      for (unsigned i = 0; i < num_views; i++) {
          unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
-      VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
+         if (fdm_offsets && !cmd->state.rp.shared_viewport) {
+            /* We need to shift over the viewport and scissor during the
+             * binning pass to match the shift applied when rendering. The way
+             * to do this is to make the per-view bin start negative. In the
+             * actual rendering pass the per-view bin start is also shifted in
+             * the negative direction, but there the first bin is clipped so
+             * that its start is never negative; here we keep the negative
+             * start to avoid clipping the user scissor to a non-zero common
+             * bin start. We skip patching loads and stores below in order to
+             * avoid patching them to a nonsensical negative-offset bin. The
+             * parts of the framebuffer left of or above the origin correspond
+             * to the non-visible parts of the left or top bins that will be
+             * discarded. The framebuffer still needs to extend to the
+             * original bottom and right, to avoid incorrectly clipping the
+             * user scissor, so we need to add to the width and height to
+             * compensate.
+             */
+            VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+            bins[i] = {
+               { -bin_offset.x, -bin_offset.y },
+               { fb->width + bin_offset.x, fb->height + bin_offset.y },
+            };
+         } else {
+            bins[i] = { { 0, 0 }, { fb->width, fb->height } };
+         }
+      }

       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
          if (patch->flags & TU_FDM_SKIP_BINNING)
            continue;
          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
          tu_cs_emit_qw(cs, patch->iova);
-         patch->apply(cmd, cs, patch->data, bin, num_views, unscaled_frag_areas);
+         patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, num_views,
+                      unscaled_frag_areas, bins);
       }

       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
    }

-   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
+   uint32_t width = fb->width + (fdm_offsets ? tiling->tile0.width : 0);
+   uint32_t height = fb->height + (fdm_offsets ? tiling->tile0.height : 0);
+
+   tu6_emit_window_scissor(cs, 0, 0, width - 1, height - 1);

    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
@@ -1929,6 +2074,22 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
       if (!iview->view.is_mutable)
          dst[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
       dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+
+      /* If FDM offset is used, the last row and column extend beyond the
+       * framebuffer but are shifted over when storing. Expand the width and
+       * height to account for that.
+       */
+      if (tu_enable_fdm_offset(cmd)) {
+         uint32_t width = dst[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
+         uint32_t height = (dst[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
+                           A6XX_TEX_CONST_1_HEIGHT__SHIFT;
+         width += cmd->state.tiling->tile0.width;
+         height += cmd->state.tiling->tile0.height;
+         dst[1] = (dst[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
+                              A6XX_TEX_CONST_1_HEIGHT__MASK)) |
+                  A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
+      }
+
       dst[2] =
          A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
          A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
@@ -2177,11 +2338,12 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

 template <chip CHIP>
 static void
 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result)
+                      struct tu_renderpass_result *autotune_result,
+                      const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    tu_lrz_tiling_begin(cmd, cs);

    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@@ -2225,7 +2387,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

       tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

-      tu6_emit_binning_pass(cmd, cs);
+      tu6_emit_binning_pass(cmd, cs, fdm_offsets);

    if (CHIP == A6XX) {
       tu_cs_emit_regs(cs,
@@ -2270,9 +2432,9 @@ template <chip CHIP>
 static void
 tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                 const struct tu_tile_config *tile,
-                bool fdm)
+                bool fdm, const VkOffset2D *fdm_offsets)
 {
-   tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm);
+   tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm, fdm_offsets);
    tu_lrz_before_tile(cmd, &cmd->cs);

    trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
@@ -2338,7 +2500,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

 static void
 tu_calc_frag_area(struct tu_cmd_buffer *cmd,
                   struct tu_tile_config *tile,
-                  const struct tu_image_view *fdm)
+                  const struct tu_image_view *fdm,
+                  const VkOffset2D *fdm_offsets)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
    const uint32_t x1 = tiling->tile0.width * tile->pos.x;
@@ -2351,11 +2514,71 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    struct tu_frag_area raw_areas[views];
    if (fdm) {
-      tu_fragment_density_map_sample(fdm,
-                                     (x1 + MIN2(x2, fb->width)) / 2,
-                                     (y1 + MIN2(y2, fb->height)) / 2,
-                                     fb->width, fb->height, views,
-                                     raw_areas);
+      for (unsigned i = 0; i < views; i++) {
+         VkOffset2D sample_pos = { 0, 0 };
+
+         /* Offsets less than a tile size are accomplished by sliding the
+          * tiles. However once we shift a whole tile size then we reset the
+          * tiles back to where they were at the beginning and we need to
+          * adjust where each bin is sampling from:
+          *
+          * x offset = 0:
+          *
+          * ------------------------------------
+          * |   *    |   *    |   *    | (unused) |
+          * ------------------------------------
+          *
+          * x offset = 4:
+          *
+          * -------------------------
+          * |   *    |   *    |   *    |   *    |
+          * -------------------------
+          *
+          * x offset = 8:
+          *
+          * ------------------------------------
+          * |   *    |   *    |   *    | (unused) |
+          * ------------------------------------
+          *
+          * As the user's offset increases we slide the tiles to the right,
+          * until we reach the whole tile size and reset the tile positions.
+          * tu_bin_offset() returns an amount to shift to the left, negating
+          * the offset.
+          *
+          * If we were forced to use a shared viewport, then we must not shift
+          * over the tiles and instead must only shift when sampling because
+          * we cannot shift the tiles differently per view. This disables
+          * smooth transitions of the fragment density map and effectively
+          * defeats the purpose of the extension.
+          *
+          * Note that we cannot clamp x2/y2 to the framebuffer size, as we
+          * normally would do, because then tiles along the edge would
+          * incorrectly nudge the sample_pos towards the center of the
+          * framebuffer. If we shift one complete tile over towards the
+          * center and reset the tiles as above, the sample_pos would
+          * then shift back towards the edge and we could get a "pop" from
+          * suddenly changing density due to the slight shift.
+          */
+         if (fdm_offsets) {
+            VkOffset2D offset = fdm_offsets[i];
+            if (!cmd->state.rp.shared_viewport) {
+               VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+               offset.x += bin_offset.x;
+               offset.y += bin_offset.y;
+            }
+            sample_pos.x = (x1 + x2) / 2 - offset.x;
+            sample_pos.y = (y1 + y2) / 2 - offset.y;
+         } else {
+            sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2;
+            sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2;
+         }
+
+         tu_fragment_density_map_sample(fdm,
+                                        sample_pos.x,
+                                        sample_pos.y,
+                                        fb->width, fb->height, i,
+                                        &raw_areas[i]);
+      }
    } else {
       for (unsigned i = 0; i < views; i++)
          raw_areas[i].width = raw_areas[i].height = 1.0f;
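To make the sampling adjustment concrete, a small worked check under assumed numbers (tile0.width = 256, user FDM offset 64; all values illustrative, not from the driver):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   int32_t tile_w = 256, fdm_offset_x = 64;

   /* tu_bin_offset(): euclid_rem(-64, 256) = 192, shifting bins left. */
   int32_t bin_offset_x = 192;

   /* A bin nominally spanning [256, 512) actually covers [64, 320) in
    * framebuffer space once shifted. */
   int32_t x1 = 256, x2 = x1 + tile_w;

   /* Sample position per tu_calc_frag_area(): the nominal bin center minus
    * the combined user offset and bin shift... */
   int32_t sample_x = (x1 + x2) / 2 - (fdm_offset_x + bin_offset_x);

   /* ...which equals the shifted bin's center (192) corrected by the
    * density map's own 64-texel shift: 192 - 64 = 128. */
   assert(sample_x == 128);
   return 0;
}
```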
@@ -2388,10 +2611,24 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
       width = 1u << util_logbase2(width);
       height = 1u << util_logbase2(height);

+      /* When FDM offset is enabled, the fragment area has to divide the
+       * offset to make sure that we don't have tiles with partial fragments.
+       * It would be bad to have the fragment area change as a function of the
+       * offset, because we'd get "popping" as the resolution changes with the
+       * offset, so just make sure it divides the offset granularity. This
+       * should mean it always divides the offset for any possible offset.
+       */
+      if (fdm_offsets) {
+         width = MIN2(width, TU_FDM_OFFSET_GRANULARITY);
+         height = MIN2(height, TU_FDM_OFFSET_GRANULARITY);
+      }
+
       /* Make sure that the width/height divides the tile width/height so
        * we don't have to do extra awkward clamping of the edges of each
-       * bin when resolving. Note that because the tile width is rounded to
-       * a multiple of 32 any power of two 32 or less will work.
+       * bin when resolving. It also has to divide the fdm offset, if any.
+       * Note that because the tile width is rounded to a multiple of 32 any
+       * power of two 32 or less will work, and if there is an offset then it
+       * must be a multiple of 4 so 2 or 4 will definitely work.
        *
        * TODO: Try to take advantage of the total area allowance here, too.
        */
@@ -2486,7 +2723,8 @@ template <chip CHIP>
 void
 tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                    uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
-                   const struct tu_image_view *fdm)
+                   const struct tu_image_view *fdm,
+                   const VkOffset2D *fdm_offsets)
 {
    uint32_t width = tx2 - tx1;
    uint32_t height = ty2 - ty1;
@@ -2505,7 +2743,7 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
          tile->extent = { 1, 1 };
          tile->pipe = pipe;
          tile->slot_mask = 1u << (width * y + x);
-         tu_calc_frag_area(cmd, tile, fdm);
+         tu_calc_frag_area(cmd, tile, fdm, fdm_offsets);
       }
    }
@@ -2549,7 +2787,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
          if (merged_tiles & (1u << tile_idx))
            continue;

-         tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], true);
+         tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx],
+                         true, fdm_offsets);
       }
    }
 }
@@ -2557,10 +2796,11 @@ template <chip CHIP>
 static void
 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
-                    struct tu_renderpass_result *autotune_result)
+                    struct tu_renderpass_result *autotune_result,
+                    const VkOffset2D *fdm_offsets)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
    const struct tu_image_view *fdm = NULL;

    if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
@@ -2571,6 +2811,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
    bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
       cmd->device->physical_device->info->a6xx.has_bin_mask;

+   /* If not using FDM make sure not to accidentally apply the offsets */
+   if (!has_fdm)
+      fdm_offsets = NULL;
+
    /* Create gmem stores now (at EndRenderPass time) because they needed to
     * know whether to allow their conditional execution, which was tied to a
     * state that was known only at the end of the renderpass. They will be
@@ -2582,7 +2826,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

    cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);

-   tu6_tile_render_begin(cmd, &cmd->cs, autotune_result);
+   tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets);

    /* Note: we reverse the order of walking the pipes and tiles on every
     * other row, to improve texture cache locality compared to raster order.
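A sketch of that serpentine walk (the shape is assumed for illustration; the driver's actual loop variables and per-pipe work differ):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void
render_pipe(uint32_t px, uint32_t py) /* stand-in for the real per-pipe work */
{
   printf("pipe (%u, %u)\n", px, py);
}

static void
walk_pipes(uint32_t cols, uint32_t rows)
{
   for (uint32_t py = 0; py < rows; py++) {
      bool reverse = py & 1; /* odd rows run right-to-left */
      for (uint32_t i = 0; i < cols; i++) {
         uint32_t px = reverse ? cols - 1 - i : i;
         render_pipe(px, py);
      }
   }
}
```

Walking back and forth keeps consecutive bins spatially adjacent, so texture fetches near bin boundaries are more likely to hit the cache than with plain raster order.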
@@ -2602,7 +2846,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
          uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height);

          if (merge_tiles) {
-            tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
+            tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
+                               fdm_offsets);
             continue;
          }

@@ -2623,9 +2868,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                .extent = { 1, 1 },
             };
             if (has_fdm)
-               tu_calc_frag_area(cmd, &tile, fdm);
+               tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);

-            tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm);
+            tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm,
+                            fdm_offsets);
          }
          slot_row += tile_row_stride;
       }
@@ -2676,7 +2922,8 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

 template <chip CHIP>
 void
-tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
+tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
+              const VkOffset2D *fdm_offsets)
 {
    if (cmd_buffer->state.rp.has_tess)
       tu6_lazy_emit_tessfactor_addr(cmd_buffer);
@@ -2685,7 +2932,7 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
    if (use_sysmem_rendering(cmd_buffer, &autotune_result))
       tu_cmd_render_sysmem(cmd_buffer, autotune_result);
    else
-      tu_cmd_render_tiles(cmd_buffer, autotune_result);
+      tu_cmd_render_tiles(cmd_buffer, autotune_result, fdm_offsets);

    /* Outside of renderpasses we assume all draw states are disabled. We do
     * this outside the draw CS for the normal case where 3d gmem stores aren't
@@ -4771,7 +5018,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
       */
      tu_restore_suspended_pass(cmd, cmd);

-      TU_CALLX(cmd->device, tu_cmd_render)(cmd);
+      TU_CALLX(cmd->device, tu_cmd_render)(cmd, NULL);
      if (cmd->state.suspend_resume == SR_IN_CHAIN)
         cmd->state.suspend_resume = SR_NONE;
      else
@@ -4877,7 +5124,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
 {
    struct tu_cs *cs = &cmd->draw_cs;
    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    /* If we might choose to bin, then put the loads under a check for geometry
     * having been binned to this tile. If we don't choose to bin in the end,
@@ -4902,7 +5149,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
       struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
       if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, true);
+            tu6_emit_blit_scissor(cmd, cs, true, false);
             emitted_scissor = true;
          }
          tu_load_gmem_attachment(cmd, cs, resolve_group, i,
@@ -4918,7 +5165,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
          &cmd->state.pass->attachments[i];
       if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, false);
+            tu6_emit_blit_scissor(cmd, cs, false, false);
             emitted_scissor = true;
          }
          tu_clear_gmem_attachment(cmd, cs, resolve_group, i);
@@ -4969,7 +5216,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
          &cmd->state.pass->attachments[i];
       if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, false);
+            tu6_emit_blit_scissor(cmd, cs, false, true);
             emitted_scissor = true;
          }
          tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
@@ -5432,7 +5679,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

    if (subpass->resolve_attachments) {
-      tu6_emit_blit_scissor(cmd, cs, true);
+      tu6_emit_blit_scissor(cmd, cs, true, false);

       struct tu_resolve_group resolve_group = {};

@@ -5908,9 +6155,10 @@ static void
 fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     void *data,
-                    VkRect2D bin,
+                    VkOffset2D common_bin_offset,
                     unsigned views,
-                    const VkExtent2D *frag_areas)
+                    const VkExtent2D *frag_areas,
+                    const VkRect2D *bins)
 {
    const struct apply_fs_params_state *state =
       (const struct apply_fs_params_state *)data;
@@ -5919,7 +6167,8 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
    for (unsigned i = 0; i < num_consts; i++) {
       assert(i < views);
       VkExtent2D area = frag_areas[i];
-      VkOffset2D offset = tu_fdm_per_bin_offset(area, bin);
+      VkRect2D bin = bins[i];
+      VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);

       tu_cs_emit(cs, area.width);
       tu_cs_emit(cs, area.height);
@@ -7443,9 +7692,25 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
       return;
    }

+   const VkSubpassFragmentDensityMapOffsetEndInfoQCOM *fdm_offset_info =
+      vk_find_struct_const(pSubpassEndInfo->pNext,
+                           SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM);
+   const VkOffset2D *fdm_offsets =
+      (fdm_offset_info && fdm_offset_info->fragmentDensityOffsetCount > 0) ?
+      fdm_offset_info->pFragmentDensityOffsets : NULL;
+
+   VkOffset2D test_offsets[MAX_VIEWS];
+   if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
+      for (unsigned i = 0;
+           i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
+         test_offsets[i] = { 64, 64 };
+      }
+      fdm_offsets = test_offsets;
+   }
+
    tu_cs_end(&cmd_buffer->draw_cs);
    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
-   TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
+   TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);

    cmd_buffer->state.cache.pending_flush_bits |=
       cmd_buffer->state.renderpass_cache.pending_flush_bits;
@@ -7483,7 +7748,16 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer)
       */
      tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
   } else {
-      TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
+      VkOffset2D test_offsets[MAX_VIEWS];
+      const VkOffset2D *fdm_offsets = NULL;
+      if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
+         for (unsigned i = 0;
+              i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
+            test_offsets[i] = { 64, 64 };
+         }
+         fdm_offsets = test_offsets;
+      }
+      TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
   }

   tu_reset_render_pass(cmd_buffer);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 5afad09f62a..3cb5476c108 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -695,7 +695,7 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                           struct tu_cmd_buffer *suspended);

 template <chip CHIP>
-void tu_cmd_render(struct tu_cmd_buffer *cmd);
+void tu_cmd_render(struct tu_cmd_buffer *cmd, const VkOffset2D *fdm_offsets);

 void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
                            uint32_t x, uint32_t y, uint32_t z);
@@ -748,12 +748,15 @@ void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
 void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                        uint32_t *rb_depth_cntl);

+bool tu_enable_fdm_offset(struct tu_cmd_buffer *cmd);
+
 typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                    struct tu_cs *cs,
                                    void *data,
-                                   VkRect2D bin,
+                                   VkOffset2D common_bin_offset,
                                    unsigned views,
-                                   const VkExtent2D *frag_areas);
+                                   const VkExtent2D *frag_areas,
+                                   const VkRect2D *bins);

 enum tu_fdm_flags {
    TU_FDM_NONE = 0,
@@ -807,13 +810,15 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
+   VkRect2D bins[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
-   }
-   apply(cmd, cs, state, (VkRect2D) {
+      bins[i] = (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
-   }, num_views, unscaled_frag_areas);
+      };
+   }
+   apply(cmd, cs, state, (VkOffset2D) {0, 0}, num_views, unscaled_frag_areas, bins);

   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h
index af4a1aaf539..a8700e276c3 100644
--- a/src/freedreno/vulkan/tu_common.h
+++ b/src/freedreno/vulkan/tu_common.h
@@ -138,6 +138,18 @@
 #define MAX_FDM_TEXEL_SIZE_LOG2 10
 #define MAX_FDM_TEXEL_SIZE (1u << MAX_FDM_TEXEL_SIZE_LOG2)

+/* This granularity is arbitrary, but there are two competing concerns here:
+ *
+ * - The fragment area has to always divide the offset, and we don't want the
+ *   fragment area changing with the offset, so we have to clamp the fragment
+ *   area to this granularity. Therefore larger granularities lead to lower
+ *   minimum resolution.
+ * - The larger the offset granularity, the choppier the motion is.
+ *
+ * Choose 8 as a compromise between the two.
+ */
+#define TU_FDM_OFFSET_GRANULARITY 8
+
 #define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
 #define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 6573d83bf22..9fdcf803159 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -333,6 +333,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .GOOGLE_user_type = true,
       .IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
       .NV_compute_shader_derivatives = device->info->chip >= 7,
+      .QCOM_fragment_density_map_offset = true,
       .VALVE_mutable_descriptor_type = true,
    } };

@@ -747,6 +748,9 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* VK_KHR_subgroup_rotate */
    features->shaderSubgroupRotate = true;
    features->shaderSubgroupRotateClustered = true;
+
+   /* VK_QCOM_fragment_density_map_offset */
+   features->fragmentDensityMapOffset = true;
 }

 static void
@@ -1385,6 +1389,11 @@ tu_get_properties(struct tu_physical_device *pdevice,
    props->degenerateLinesRasterized = false;
    props->fullyCoveredFragmentShaderInputVariable = false;
    props->conservativeRasterizationPostDepthCoverage = false;
+
+   /* VK_QCOM_fragment_density_map_offset */
+   props->fragmentDensityOffsetGranularity = (VkExtent2D) {
+      TU_FDM_OFFSET_GRANULARITY, TU_FDM_OFFSET_GRANULARITY
+   };
 }

 static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index e8277967a0b..fc9b898eea0 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -488,7 +488,7 @@ struct tu_tiling_config {
    /* Whether using GMEM is even possible with this configuration */
    bool possible;

-   struct tu_vsc_config vsc;
+   struct tu_vsc_config vsc, fdm_offset_vsc;
 };

 struct tu_framebuffer
diff --git a/src/freedreno/vulkan/tu_dynamic_rendering.cc b/src/freedreno/vulkan/tu_dynamic_rendering.cc
index 1ff7d4b631e..e7618b47281 100644
--- a/src/freedreno/vulkan/tu_dynamic_rendering.cc
+++ b/src/freedreno/vulkan/tu_dynamic_rendering.cc
@@ -152,7 +152,7 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev,
                           old_cmds[i]->pre_chain.trace_renderpass_end);
       }

-      TU_CALLX(dev, tu_cmd_render)(cmd_buffer);
+      TU_CALLX(dev, tu_cmd_render)(cmd_buffer, NULL);

       tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3);
       tu_cs_emit_qw(&cmd_buffer->cs,
diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc
index da5e1e520a4..16da996a4d0 100644
--- a/src/freedreno/vulkan/tu_image.cc
+++ b/src/freedreno/vulkan/tu_image.cc
@@ -1163,10 +1163,10 @@ tu_DestroyImageView(VkDevice _device,
  */
 void
 tu_fragment_density_map_sample(const struct tu_image_view *fdm,
-                               uint32_t x, uint32_t y,
+                               int32_t x, int32_t y,
                                uint32_t width, uint32_t height,
-                               uint32_t layers,
-                               struct tu_frag_area *areas)
+                               uint32_t layer,
+                               struct tu_frag_area *area)
 {
    assert(fdm->image->layout[0].tile_mode == TILE6_LINEAR);

@@ -1176,20 +1176,19 @@ tu_fragment_density_map_sample(const struct tu_image_view *fdm,
    fdm_shift_x = CLAMP(fdm_shift_x, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
    fdm_shift_y = CLAMP(fdm_shift_y, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);

-   uint32_t i = x >> fdm_shift_x;
-   uint32_t j = y >> fdm_shift_y;
+   int32_t i = x >> fdm_shift_x;
+   int32_t j = y >> fdm_shift_y;
+
+   i = CLAMP(i, 0, fdm->vk.extent.width - 1);
+   j = CLAMP(j, 0, fdm->vk.extent.height - 1);

    unsigned cpp = fdm->image->layout[0].cpp;
    unsigned pitch = fdm->view.pitch;

-   void *pixel = (char *)fdm->image->map + fdm->view.offset + cpp * i + pitch * j;
-   for (unsigned i = 0; i < layers; i++) {
-      float density_src[4], density[4];
-      util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
-      pipe_swizzle_4f(density, density_src, fdm->swizzle);
-      areas[i].width = 1.0f / density[0];
-      areas[i].height = 1.0f / density[1];
-
-      pixel = (char *)pixel + fdm->view.layer_size;
-   }
+   void *pixel = (char *)fdm->image->map + fdm->view.offset + fdm->view.layer_size * layer + cpp * i + pitch * j;
+   float density_src[4], density[4];
+   util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
+   pipe_swizzle_4f(density, density_src, fdm->swizzle);
+   area->width = 1.0f / density[0];
+   area->height = 1.0f / density[1];
 }
diff --git a/src/freedreno/vulkan/tu_image.h b/src/freedreno/vulkan/tu_image.h
index 5d47327b7c9..6bfb48c7bc0 100644
--- a/src/freedreno/vulkan/tu_image.h
+++ b/src/freedreno/vulkan/tu_image.h
@@ -129,9 +129,9 @@ struct tu_frag_area {

 void
 tu_fragment_density_map_sample(const struct tu_image_view *fdm,
-                               uint32_t x, uint32_t y,
+                               int32_t x, int32_t y,
                                uint32_t width, uint32_t height,
-                               uint32_t layers, struct tu_frag_area *areas);
+                               uint32_t layer, struct tu_frag_area *area);

 VkResult
 tu_image_update_layout(struct tu_device *device, struct tu_image *image,
diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc
index 9c352346fc3..b26cfe3d7c9 100644
--- a/src/freedreno/vulkan/tu_lrz.cc
+++ b/src/freedreno/vulkan/tu_lrz.cc
@@ -157,7 +157,7 @@ tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                    struct A6XX_GRAS_LRZ_CNTL cntl)
 {
    if (CHIP >= A7XX) {
-      // A7XX split LRZ_CNTL into two seperate registers.
+      /* A7XX split LRZ_CNTL into two separate registers. */
       struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
          .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
          .fc_enable = cntl.fc_enable,
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 47ac853229f..52c9303e12b 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -2548,44 +2548,49 @@ struct apply_viewport_state {
    bool share_scale;
 };

-/* It's a hardware restriction that the window offset (i.e. bin.offset) must
- * be the same for all views. This means that GMEM coordinates cannot be a
- * simple scaling of framebuffer coordinates, because this would require us to
- * scale the window offset and the scale may be different per view. Instead we
- * have to apply a per-bin offset to the GMEM coordinate transform to make
- * sure that the window offset maps to itself. Specifically we need an offset
- * o to the transform:
+/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
+ * must be the same for all views. This means that GMEM coordinates cannot be
+ * a simple scaling of framebuffer coordinates, because this would require us
+ * to scale the window offset and the scale may be different per view. Instead
+ * we have to apply a per-bin offset to the GMEM coordinate transform to make
+ * sure that the window offset maps to the per-view bin coordinate, which will
+ * be the same if there is no offset. Specifically we need an offset o to the
+ * transform:
  *
  *    x' = s * x + o
  *
- * so that when we plug in the bin start b_s:
+ * so that when we plug in the per-view bin start b_s and the common window
+ * offset b_cs:
  *
- *    b_s = s * b_s + o
+ *    b_cs = s * b_s + o
  *
  * and we get:
  *
- *    o = b_s - s * b_s
+ *    o = b_cs - s * b_s
  *
- * We use this form exactly, because we know the bin offset is a multiple of
+ * We use this form exactly, because we know the bin start is a multiple of
  * the frag area so s * b_s is an integer and we can compute an exact result
- * easily.
+ * easily. We also have to make sure that the bin offset is a multiple of the
+ * frag area by restricting the frag area.
  */
 VkOffset2D
-tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
+tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
+                      VkOffset2D common_bin_offset)
 {
    assert(bin.offset.x % frag_area.width == 0);
    assert(bin.offset.y % frag_area.height == 0);

    return (VkOffset2D) {
-      bin.offset.x - bin.offset.x / frag_area.width,
-      bin.offset.y - bin.offset.y / frag_area.height
+      common_bin_offset.x - bin.offset.x / frag_area.width,
+      common_bin_offset.y - bin.offset.y / frag_area.height
    };
 }

 static void
 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
-                    VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
+                    VkOffset2D common_bin_offset, unsigned views,
+                    const VkExtent2D *frag_areas, const VkRect2D *bins)
 {
    const struct apply_viewport_state *state =
       (const struct apply_viewport_state *)data;
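A standalone check of this derivation (hypothetical numbers; here s = 1/frag_area, so s * b_s is computed as b_s / frag_area):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   /* Illustrative values: 2x2 fragment area, per-view bin start b_s = 192,
    * common window offset b_cs = 256. */
   int32_t frag_w = 2, b_s = 192, b_cs = 256;

   /* o = b_cs - s * b_s, exact because frag_w divides b_s. */
   assert(b_s % frag_w == 0);
   int32_t o = b_cs - b_s / frag_w;   /* 256 - 96 = 160 */

   /* Plugging the bin start back into x' = s * x + o must land exactly on
    * the common window offset. */
   assert(b_s / frag_w + o == b_cs);
   return 0;
}
```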
@@ -2603,9 +2608,12 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
        * replicate it across all viewports.
        */
       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D bin = state->share_scale ? bins[0] : bins[i];
       VkViewport viewport =
          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
-      if (frag_area.width == 1 && frag_area.height == 1) {
+      if (frag_area.width == 1 && frag_area.height == 1 &&
+          common_bin_offset.x == bin.offset.x &&
+          common_bin_offset.y == bin.offset.y) {
          vp.viewports[i] = viewport;
          continue;
       }
@@ -2618,7 +2626,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
       vp.viewports[i].width = viewport.width * scale_x;
       vp.viewports[i].height = viewport.height * scale_y;

-      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
+                                                common_bin_offset);

       vp.viewports[i].x = scale_x * viewport.x + offset.x;
       vp.viewports[i].y = scale_y * viewport.y + offset.y;
@@ -2694,7 +2703,8 @@ tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)

 static void
 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
-                   VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
+                   VkOffset2D common_bin_offset, unsigned views,
+                   const VkExtent2D *frag_areas, const VkRect2D *bins)
 {
    const struct apply_viewport_state *state =
       (const struct apply_viewport_state *)data;
@@ -2703,12 +2713,9 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
    for (unsigned i = 0; i < vp.scissor_count; i++) {
       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D bin = state->share_scale ? bins[0] : bins[i];
       VkRect2D scissor =
          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
-      if (frag_area.width == 1 && frag_area.height == 1) {
-         vp.scissors[i] = scissor;
-         continue;
-      }

       /* Transform the scissor following the viewport. It's unclear how this
        * is supposed to handle cases where the scissor isn't aligned to the
        * fragment area; we avoid dropping fragments if the scissor size
        * equals the framebuffer size and it isn't aligned to the fragment
        * area.
        */
-      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
+                                                common_bin_offset);

       VkOffset2D min = {
          scissor.offset.x / frag_area.width + offset.x,
          scissor.offset.y / frag_area.width + offset.y,
       };
@@ -2731,12 +2739,12 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
        */
       uint32_t scaled_width = bin.extent.width / frag_area.width;
       uint32_t scaled_height = bin.extent.height / frag_area.height;
-      vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
-      vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
+      vp.scissors[i].offset.x = MAX2(min.x, common_bin_offset.x);
+      vp.scissors[i].offset.y = MAX2(min.y, common_bin_offset.y);
       vp.scissors[i].extent.width =
-         MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
+         MIN2(max.x, common_bin_offset.x + scaled_width) - vp.scissors[i].offset.x;
       vp.scissors[i].extent.height =
-         MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
+         MIN2(max.y, common_bin_offset.y + scaled_height) - vp.scissors[i].offset.y;
    }

    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index f16fed7d8c1..547c14756c6 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -243,7 +243,8 @@ TU_DECL_PIPELINE_DOWNCAST(graphics, TU_PIPELINE_GRAPHICS)
 TU_DECL_PIPELINE_DOWNCAST(graphics_lib, TU_PIPELINE_GRAPHICS_LIB)
 TU_DECL_PIPELINE_DOWNCAST(compute, TU_PIPELINE_COMPUTE)

-VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin);
+VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
+                                 VkOffset2D common_bin_offset);

 template <chip CHIP>
 uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd);
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index cf4b88772f1..36e6c6d561a 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -49,6 +49,7 @@ static const struct debug_control tu_debug_options[] = {
    { "dumpas", TU_DEBUG_DUMPAS },
    { "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
    { "perfcraw", TU_DEBUG_PERFCRAW },
+   { "fdmoffset", TU_DEBUG_FDM_OFFSET },
    { NULL, 0 }
 };

@@ -454,6 +455,16 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
       tu_tiling_config_update_pipe_layout(vsc, device, pass->has_fdm);
       tu_tiling_config_update_pipes(vsc, device);
       tu_tiling_config_update_binning(vsc, device);
+
+      if (pass->has_fdm) {
+         struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
+         fdm_offset_vsc->tile_count = (VkExtent2D) {
+            vsc->tile_count.width + 1, vsc->tile_count.height + 1
+         };
+         tu_tiling_config_update_pipe_layout(fdm_offset_vsc, device, true);
+         tu_tiling_config_update_pipes(fdm_offset_vsc, device);
+         tu_tiling_config_update_binning(fdm_offset_vsc, device);
+      }
    }
 }

diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 173c8ace984..5ebdcd26a9d 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -69,6 +69,7 @@ enum tu_debug_flags : uint64_t
    TU_DEBUG_DUMPAS = BITFIELD64_BIT(28),
    TU_DEBUG_NO_BIN_MERGING = BITFIELD64_BIT(29),
    TU_DEBUG_PERFCRAW = BITFIELD64_BIT(30),
+   TU_DEBUG_FDM_OFFSET = BITFIELD64_BIT(31),
 };

 struct tu_env {
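The fdm_offset_vsc configuration in tu_util.cc above sizes the bin grid one tile larger in each dimension: once bins slide left by up to nearly a full tile, the same framebuffer can straddle one extra tile column and row. A quick sanity check of that arithmetic (illustrative numbers, not from the driver):

```c
#include <assert.h>
#include <stdint.h>

/* Ceil-divide, the usual way a framebuffer is split into tiles. */
static uint32_t
div_round_up(uint32_t a, uint32_t b)
{
   return (a + b - 1) / b;
}

int
main(void)
{
   /* Assumed numbers: 1920-wide framebuffer, 256-wide tiles. */
   uint32_t fb_width = 1920, tile_w = 256;

   uint32_t tiles = div_round_up(fb_width, tile_w);                  /* 8 */

   /* With a non-zero offset the bins can slide left by up to tile_w - 1,
    * so the framebuffer can cover one more tile column. */
   uint32_t tiles_with_offset = div_round_up(fb_width + tile_w - 1, tile_w);

   assert(tiles == 8);
   assert(tiles_with_offset == tiles + 1);                           /* 9 */
   return 0;
}
```

For testing without an offset-aware application, the patch also wires the new debug flag together with the existing one: setting both FDM debug flags (TU_DEBUG(FDM) and TU_DEBUG(FDM_OFFSET)) forces a { 64, 64 } offset for every view at the end of the render pass.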