diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index dc9a63351dd..b4a357e8b3f 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -592,9 +592,13 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
     * value as the last draw. This means that if the descriptor sets change
     * but not the pipeline, we'd try to re-execute the same buffer which
     * the firmware would ignore and we wouldn't pre-load the new
-    * descriptors. Set the DIRTY bit to avoid this optimization
+    * descriptors. Set the DIRTY bit to avoid this optimization.
+    *
+    * We also need to set this bit for draw states which may be patched by the
+    * GPU, because their underlying memory may change between setting the draw
+    * state and executing it.
     */
-   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
+   if (id == TU_DRAW_STATE_DESC_SETS_LOAD || state.writeable)
      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
 
    tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
@@ -767,7 +771,8 @@ tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 
 static void
 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
-                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
+                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot,
+                     const struct tu_image_view *fdm)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
 
@@ -776,9 +781,9 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
 
    const uint32_t x1 = tiling->tile0.width * tx;
    const uint32_t y1 = tiling->tile0.height * ty;
-   const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
-   const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
-   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
+   const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
+   const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
+   tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
    tu6_emit_window_offset(cs, x1, y1);
 
    bool hw_binning = use_hw_binning(cmd);
@@ -804,6 +809,70 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
 
    tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
    tu_cs_emit(cs, 0x0);
+
+   if (fdm || (TU_DEBUG(FDM) && cmd->state.pass->has_fdm)) {
+      unsigned views =
+         cmd->state.pass->num_views ? cmd->state.pass->num_views : 1;
+      const struct tu_framebuffer *fb = cmd->state.framebuffer;
+      struct tu_frag_area raw_areas[views];
+      if (fdm) {
+         tu_fragment_density_map_sample(fdm,
+                                        (x1 + MIN2(x2, fb->width)) / 2,
+                                        (y1 + MIN2(y2, fb->height)) / 2,
+                                        fb->width, fb->height, views,
+                                        raw_areas);
+      } else {
+         for (unsigned i = 0; i < views; i++)
+            raw_areas[i].width = raw_areas[i].height = 1.0f;
+      }
+
+      VkExtent2D frag_areas[views];
+      for (unsigned i = 0; i < views; i++) {
+         float floor_x, floor_y;
+         float area = raw_areas[i].width * raw_areas[i].height;
+         float frac_x = modff(raw_areas[i].width, &floor_x);
+         float frac_y = modff(raw_areas[i].height, &floor_y);
+         /* The spec allows rounding up one of the axes as long as the total
+          * area is less than or equal to the original area. Take advantage of
+          * this to try rounding up the number with the largest fraction.
+          */
+         if ((frac_x > frac_y ? (floor_x + 1.f) * floor_y :
+                                floor_x * (floor_y + 1.f)) <= area) {
+            if (frac_x > frac_y)
+               floor_x += 1.f;
+            else
+               floor_y += 1.f;
+         }
+         frag_areas[i].width = floor_x;
+         frag_areas[i].height = floor_y;
+
+         /* Make sure that the width/height divides the tile width/height so
+          * we don't have to do extra awkward clamping of the edges of each
+          * bin when resolving. Note that because the tile width is rounded to
+          * a multiple of 32 any power of two 32 or less will work.
+          *
+          * TODO: Try to take advantage of the total area allowance here, too.
+          */
+         while (tiling->tile0.width % frag_areas[i].width != 0)
+            frag_areas[i].width--;
+         while (tiling->tile0.height % frag_areas[i].height != 0)
+            frag_areas[i].height--;
+      }
+
+      VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
+      util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
+                             struct tu_fdm_bin_patchpoint, patch) {
+         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
+         tu_cs_emit_qw(cs, patch->iova);
+         patch->apply(cs, patch->data, bin, views, frag_areas);
+      }
+
+      /* Make the CP wait until the CP_MEM_WRITEs to the command buffers
+       * land.
+       */
+      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+   }
 }
 
 static void
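To see the rounding rule in isolation: a minimal standalone sketch of the decision above (round_frag_area is an invented name; the driver keeps this logic inline in tu6_emit_tile_select()):

   #include <math.h>

   /* Round up the axis with the larger fractional part, but only when the
    * rounded extent covers no more area than the originally requested one,
    * which is what the fragment density map spec permits.
    */
   static void
   round_frag_area(float w, float h, unsigned *out_w, unsigned *out_h)
   {
      float floor_x, floor_y;
      float area = w * h;
      float frac_x = modff(w, &floor_x);
      float frac_y = modff(h, &floor_y);

      if ((frac_x > frac_y ? (floor_x + 1.f) * floor_y :
                             floor_x * (floor_y + 1.f)) <= area) {
         if (frac_x > frac_y)
            floor_x += 1.f;
         else
            floor_y += 1.f;
      }

      *out_w = (unsigned) floor_x;
      *out_h = (unsigned) floor_y;
   }

For example, a sampled area of 1.2x1.1 (total 1.32) stays at 1x1, because rounding the width up would give 2x1 = 2 > 1.32, while 1.5x2.5 (total 3.75) becomes 1x3, because 1x3 = 3 <= 3.75.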
@@ -1110,6 +1179,28 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
 
+   /* If this command buffer may be executed multiple times, then
+    * viewport/scissor states may have been changed by previous executions
+    * and we need to reset them before executing the binning IB.
+    */
+   if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
+       cmd->fdm_bin_patchpoints.size != 0) {
+      unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+      VkExtent2D unscaled_frag_areas[num_views];
+      for (unsigned i = 0; i < num_views; i++)
+         unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
+      VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
+      util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
+                             struct tu_fdm_bin_patchpoint, patch) {
+         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
+         tu_cs_emit_qw(cs, patch->iova);
+         patch->apply(cs, patch->data, bin, num_views, unscaled_frag_areas);
+      }
+
+      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+   }
+
    tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
 
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
@@ -1487,9 +1578,10 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 
 static void
 tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
+                uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot,
+                const struct tu_image_view *fdm)
 {
-   tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
+   tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot, fdm);
 
    trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
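For context on the CP_MEM_WRITE replay loops (both here and in tu6_emit_tile_select() above): each patchpoint's apply callback regenerates the dwords of one draw state for the current bin. A hypothetical callback to show the shape; the struct, name, and scaling below are illustrative assumptions, not the driver's actual call sites:

   struct fdm_viewport_state {
      VkViewport viewport;
   };

   static void
   fdm_apply_viewport(struct tu_cs *cs, void *data, VkRect2D bin,
                      unsigned views, VkExtent2D *frag_areas)
   {
      const struct fdm_viewport_state *state = data;
      VkViewport vp = state->viewport;

      /* With a fragment area of AxB, one fragment covers AxB pixels, so
       * geometry in this bin renders at 1/A x 1/B scale.
       */
      vp.x /= frag_areas[0].width;
      vp.y /= frag_areas[0].height;
      vp.width /= frag_areas[0].width;
      vp.height /= frag_areas[0].height;

      /* ...emit exactly the number of dwords the patchpoint reserved,
       * e.g. via the driver's viewport emission helper...
       */
   }

The CP_WAIT_MEM_WRITES plus CP_WAIT_FOR_ME pair that follows each replay loop makes sure the memory writes land before any IB containing the patched dwords is fetched and executed.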
@@ -1546,6 +1638,11 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                     struct tu_renderpass_result *autotune_result)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
+   const struct tu_image_view *fdm = NULL;
+
+   if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
+      fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment];
+   }
 
    /* Create gmem stores now (at EndRenderPass time) because they needed to
     * know whether to allow their conditional execution, which was tied to a
@@ -1586,7 +1683,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
          else
             tx = tile_row_i;
          uint32_t slot = slot_row + tx;
-         tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot);
+         tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot, fdm);
       }
       slot_row += tile_row_stride;
    }
@@ -1666,6 +1763,11 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
    /* LRZ is not valid next time we use it */
    cmd_buffer->state.lrz.valid = false;
    cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
+
+   /* Patchpoints have been executed */
+   util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
+   ralloc_free(cmd_buffer->patchpoints_ctx);
+   cmd_buffer->patchpoints_ctx = NULL;
 }
 
 static VkResult
@@ -1738,6 +1840,9 @@ tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
                 cmd_buffer->descriptors[i].push_set.mapped_ptr);
    }
 
+   ralloc_free(cmd_buffer->patchpoints_ctx);
+   util_dynarray_fini(&cmd_buffer->fdm_bin_patchpoints);
+
    vk_command_buffer_finish(&cmd_buffer->vk);
 
    vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
            cmd_buffer);
@@ -1781,6 +1886,10 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
 
    cmd_buffer->state.last_prim_params.valid = false;
 
    cmd_buffer->vsc_initialized = false;
+
+   ralloc_free(cmd_buffer->patchpoints_ctx);
+   cmd_buffer->patchpoints_ctx = NULL;
+   util_dynarray_clear(&cmd_buffer->fdm_bin_patchpoints);
 }
 
 const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
@@ -1891,6 +2000,8 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
             &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
       }
 
+      cmd_buffer->patchpoints_ctx = ralloc_context(NULL);
+
       /* We can't set the gmem layout here, because the state.pass only has
        * to be compatible (same formats/sample counts) with the primary's
        * renderpass, rather than exactly equal.
        */
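The alloc/free pairs in the last few hunks follow one ownership model: every captured patchpoint state is ralloc'd out of patchpoints_ctx, so a single ralloc_free() releases all of it, and the dynarray entries hold non-owning pointers. A minimal sketch of that contract, using mesa's ralloc API:

   #include "util/ralloc.h"

   static void
   patchpoint_lifetime_sketch(void)
   {
      /* Like cmd->patchpoints_ctx: one parent for all captured state. */
      void *ctx = ralloc_context(NULL);

      /* Like the per-patchpoint copies: parented to ctx, never freed
       * individually.
       */
      void *state_a = ralloc_size(ctx, 64);
      void *state_b = ralloc_size(ctx, 128);
      (void) state_a;
      (void) state_b;

      /* One free at render-pass reset or destroy releases every blob. */
      ralloc_free(ctx);
   }

This is also what lets tu_save_pre_chain() below hand off all captured state at once, by moving the single context pointer into pre_chain.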
@@ -4046,6 +4157,8 @@ tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                            &secondary->pre_chain.state);
    tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->pre_chain.trace_renderpass_start,
          secondary->pre_chain.trace_renderpass_end);
+   util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
+                                 &secondary->pre_chain.fdm_bin_patchpoints);
 }
 
 /* Take the saved post-chain in "secondary" and copy it to "cmd".
@@ -4060,6 +4173,8 @@ tu_append_post_chain(struct tu_cmd_buffer *cmd,
    tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start,
          secondary->trace_renderpass_end);
    cmd->state.rp = secondary->state.rp;
+   util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
+                                 &secondary->fdm_bin_patchpoints);
 }
 
 /* Assuming "secondary" is just a sequence of suspended and resuming passes,
@@ -4079,6 +4194,8 @@ tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
          secondary->trace_renderpass_end);
    tu_render_pass_state_merge(&cmd->state.rp,
                               &secondary->state.rp);
+   util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
+                                 &secondary->fdm_bin_patchpoints);
 }
 
 /* Take the current render pass state and save it to "pre_chain" to be
@@ -4096,6 +4213,10 @@ tu_save_pre_chain(struct tu_cmd_buffer *cmd)
    cmd->pre_chain.trace_renderpass_end = cmd->trace_renderpass_end;
    cmd->pre_chain.state = cmd->state.rp;
+   util_dynarray_append_dynarray(&cmd->pre_chain.fdm_bin_patchpoints,
+                                 &cmd->fdm_bin_patchpoints);
+   cmd->pre_chain.patchpoints_ctx = cmd->patchpoints_ctx;
+   cmd->patchpoints_ctx = NULL;
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -4145,6 +4266,8 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
             tu_clone_trace(cmd, &cmd->draw_cs, &secondary->trace);
             tu_render_pass_state_merge(&cmd->state.rp,
                                        &secondary->state.rp);
+            util_dynarray_append_dynarray(&cmd->fdm_bin_patchpoints,
+                                          &secondary->fdm_bin_patchpoints);
          } else {
             switch (secondary->state.suspend_resume) {
             case SR_NONE:
@@ -4389,6 +4512,9 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
 
    tu_emit_renderpass_begin(cmd, pRenderPassBegin->pClearValues);
    tu_emit_subpass_begin(cmd);
+
+   if (pass->has_fdm)
+      cmd->patchpoints_ctx = ralloc_context(NULL);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -4465,6 +4591,9 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
       cmd->state.attachments[a] = view;
    }
 
+   if (cmd->dynamic_pass.has_fdm)
+      cmd->patchpoints_ctx = ralloc_context(NULL);
+
    tu_choose_gmem_layout(cmd);
 
    cmd->state.renderpass_cache.pending_flush_bits =
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index e1125c7d729..9216d1e4caa 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -558,6 +558,9 @@ struct tu_cmd_buffer
    struct list_head renderpass_autotune_results;
    struct tu_autotune_results_buffer* autotune_buffer;
 
+   void *patchpoints_ctx;
+   struct util_dynarray fdm_bin_patchpoints;
+
    VkCommandBufferUsageFlags usage_flags;
 
    VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
@@ -602,6 +605,9 @@ struct tu_cmd_buffer
       struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
 
       struct tu_render_pass_state state;
+
+      struct util_dynarray fdm_bin_patchpoints;
+      void *patchpoints_ctx;
    } pre_chain;
 
    uint32_t vsc_draw_strm_pitch;
@@ -691,4 +697,55 @@ void tu6_apply_depth_bounds_workaround(struct tu_device *device,
 
 void update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
 
+typedef void (*tu_fdm_bin_apply_t)(struct tu_cs *cs, void *data, VkRect2D bin,
+                                   unsigned views, VkExtent2D *frag_areas);
+
+struct tu_fdm_bin_patchpoint {
+   uint64_t iova;
+   uint32_t size;
+   void *data;
+   tu_fdm_bin_apply_t apply;
+};
+
+static inline void
+_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
+                              struct tu_cs *cs,
+                              unsigned size,
+                              tu_fdm_bin_apply_t apply,
+                              void *state,
+                              unsigned state_size)
+{
+   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
+   memcpy(data, state, state_size);
+   assert(cs->writeable);
+   tu_cs_reserve_space(cs, size);
+   struct tu_fdm_bin_patchpoint patch = {
+      .iova = tu_cs_get_cur_iova(cs),
+      .size = size,
+      .data = data,
+      .apply = apply,
+   };
+
+   /* Apply the "default" setup where there is no scaling. This is used if
+    * sysmem is required, and uses up the dwords that have been reserved.
+    */
+   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+   VkExtent2D unscaled_frag_areas[num_views];
+   for (unsigned i = 0; i < num_views; i++) {
+      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
+   }
+   apply(cs, state, (VkRect2D) {
+      { 0, 0 },
+      { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
+   }, num_views, unscaled_frag_areas);
+   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
+
+   util_dynarray_append(&cmd->fdm_bin_patchpoints,
+                        struct tu_fdm_bin_patchpoint,
+                        patch);
+}
+
+#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
+   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
+
 #endif /* TU_CMD_BUFFER_H */
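A sketch of how a call site uses this API; the real call sites live in the draw-state emission paths and are not part of this diff, so the state struct, register name, and sizes below are invented for illustration:

   struct fdm_example_state {
      uint32_t width;
   };

   static void
   fdm_apply_example(struct tu_cs *cs, void *data, VkRect2D bin,
                     unsigned views, VkExtent2D *frag_areas)
   {
      const struct fdm_example_state *state = data;
      /* Must emit exactly the 2 dwords reserved below: one pkt4 header
       * plus one payload dword. REG_A6XX_EXAMPLE is a placeholder.
       */
      tu_cs_emit_pkt4(cs, REG_A6XX_EXAMPLE, 1);
      tu_cs_emit(cs, state->width / frag_areas[0].width);
   }

   static void
   emit_example_state(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
   {
      struct fdm_example_state state = { .width = 1024 };
      /* cs must be a writeable sub-stream, per the assert in the helper. */
      tu_create_fdm_bin_patchpoint(cmd, cs, 2, fdm_apply_example, state);
   }

The helper immediately emits the unscaled default into the reserved dwords, so the command stream is valid even if the render pass falls back to sysmem and no per-bin patching happens; tu6_emit_tile_select() rewrites the same dwords per bin via CP_MEM_WRITE when FDM is in use.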
diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h
index 096064ec7f8..419d82842d8 100644
--- a/src/freedreno/vulkan/tu_cs.h
+++ b/src/freedreno/vulkan/tu_cs.h
@@ -66,8 +66,9 @@ struct tu_cs_memory {
 };
 
 struct tu_draw_state {
-   uint64_t iova : 48;
-   uint32_t size : 16;
+   uint64_t iova;
+   uint16_t size;
+   bool writeable;
 };
 
 struct tu_bo_array {
@@ -164,6 +165,7 @@ tu_cs_end_draw_state(struct tu_cs *cs, struct tu_cs *sub_cs)
    return (struct tu_draw_state) {
       .iova = entry.bo->iova + entry.offset,
       .size = entry.size / sizeof(uint32_t),
+      .writeable = sub_cs->writeable,
    };
 }
 
@@ -188,6 +190,7 @@ tu_cs_draw_state(struct tu_cs *sub_cs, struct tu_cs *cs, uint32_t size)
    return (struct tu_draw_state) {
      .iova = memory.iova,
      .size = size,
+      .writeable = sub_cs->writeable,
    };
 }