From fec372dfa502f588e894c19e9c2a8ba2c73d6c9a Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Mon, 21 Nov 2022 14:52:44 +0100
Subject: [PATCH] tu: Implement FDM viewport patching

We scale the actual rendering by patching the viewport state. This is
helped by a HW bit to make the viewport index equal to the view index,
so that we can have a different scaling per view.

Part-of: 
---
 src/freedreno/vulkan/tu_cmd_buffer.cc | 236 +++++++++++++++++++++++++-
 src/freedreno/vulkan/tu_cmd_buffer.h  |   2 +
 src/freedreno/vulkan/tu_pipeline.cc   |  70 +++++++-
 src/freedreno/vulkan/tu_pipeline.h    |   5 +
 4 files changed, 306 insertions(+), 7 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index b4a357e8b3f..adb68195e26 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -859,6 +859,21 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
          frag_areas[i].height--;
       }
 
+      /* If at any point we were forced to use the same scaling for all
+       * viewports, we need to make sure that any users *not* using shared
+       * scaling, including loads/stores, also consistently share the scaling.
+       */
+      if (cmd->state.rp.shared_viewport) {
+         VkExtent2D frag_area = { UINT32_MAX, UINT32_MAX };
+         for (unsigned i = 0; i < views; i++) {
+            frag_area.width = MIN2(frag_area.width, frag_areas[i].width);
+            frag_area.height = MIN2(frag_area.height, frag_areas[i].height);
+         }
+
+         for (unsigned i = 0; i < views; i++)
+            frag_areas[i] = frag_area;
+      }
+
       VkRect2D bin = { { x1, y1 }, { x2 - x1, y2 - y1 } };
       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
@@ -1455,6 +1470,15 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
          tu_clear_sysmem_attachment(cmd, cs, i, &clear_values[i]);
 
    tu_cond_exec_end(cs);
+
+   /* We need to re-emit any draw states that are patched in order for them to
+    * be correctly added to the per-renderpass patchpoint list, even if they
+    * are the same as before.
+    */
+   if (cmd->state.pass->has_fdm) {
+      cmd->state.dirty |=
+         TU_CMD_DIRTY_VIEWPORTS | TU_CMD_DIRTY_SCISSORS;
+   }
 }
 
 static void
@@ -1694,6 +1718,15 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
 
    trace_end_render_pass(&cmd->trace, &cmd->cs);
 
+   /* We have trashed the dynamically-emitted viewport, scissor, and FS params
+    * via the patchpoints, so we need to re-emit them if they are reused for a
+    * later render pass.
+    */
+   if (cmd->state.pass->has_fdm) {
+      cmd->state.dirty |=
+         TU_CMD_DIRTY_VIEWPORTS | TU_CMD_DIRTY_SCISSORS;
+   }
+
    /* tu6_render_tile has cloned these tracepoints for each tile */
    if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
       u_trace_disable_event_range(cmd->trace_renderpass_start,
@@ -2911,6 +2944,15 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
    }
 
+   if (pipeline->viewport.set_dynamic_scissor_to_static) {
+      memcpy(cmd->state.scissor, pipeline->viewport.scissors,
+             pipeline->viewport.num_scissors *
+             sizeof(pipeline->viewport.scissors[0]));
+
+      cmd->state.scissor_count = pipeline->viewport.num_scissors;
+      cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
+   }
+
    if ((pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)) &&
       !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VIEWPORT_COUNT)) &&
       cmd->state.viewport_count != pipeline->viewport.num_viewports) {
@@ -2925,6 +2967,14 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
    }
 
+   if (pipeline->viewport.per_view_viewport != cmd->state.per_view_viewport) {
+      cmd->state.per_view_viewport = pipeline->viewport.per_view_viewport;
+      if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))
+         cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
+      if (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_SCISSOR))
+         cmd->state.dirty |= TU_CMD_DIRTY_SCISSORS;
+   }
+
    if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)))
       cmd->state.dirty &= ~TU_CMD_DIRTY_VIEWPORTS;
@@ -4122,6 +4172,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
    dst->disable_gmem |= src->disable_gmem;
    dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
    dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
+   dst->shared_viewport |= src->shared_viewport;
 
    dst->drawcall_count += src->drawcall_count;
    dst->drawcall_bandwidth_per_sample_sum +=
@@ -5060,6 +5111,146 @@ tu6_emit_blend(struct tu_cs *cs, struct tu_cmd_buffer *cmd)
                               ~pipeline->blend.rb_blend_cntl_mask));
 }
 
+struct apply_viewport_state {
+   VkViewport viewports[MAX_VIEWPORTS];
+   unsigned num_viewports;
+   bool z_negative_one_to_one;
+   bool share_scale;
+};
+
+/* It's a hardware restriction that the window offset (i.e. bin.offset) must
+ * be the same for all views. This means that GMEM coordinates cannot be a
+ * simple scaling of framebuffer coordinates, because this would require us to
+ * scale the window offset and the scale may be different per view. Instead we
+ * have to apply a per-bin offset to the GMEM coordinate transform to make
+ * sure that the window offset maps to itself. Specifically we need an offset
+ * o to the transform:
+ *
+ *    x' = s * x + o
+ *
+ * so that when we plug in the bin start b_s:
+ *
+ *    b_s = s * b_s + o
+ *
+ * and we get:
+ *
+ *    o = b_s - s * b_s
+ *
+ * We use this form exactly, because we know the bin offset is a multiple of
+ * the frag area so s * b_s is an integer and we can compute an exact result
+ * easily.
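+ *
+ * As a worked example (with numbers chosen purely for illustration): for a
+ * frag area of 2 and a bin starting at b_s = 96, we have s = 1/2 and
+ * o = 96 - 96 / 2 = 48, so the transform is x' = x / 2 + 48, and the bin
+ * start indeed maps to itself: 96 / 2 + 48 = 96.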
+ */
+
+static VkOffset2D
+fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
+{
+   assert(bin.offset.x % frag_area.width == 0);
+   assert(bin.offset.y % frag_area.height == 0);
+
+   return (VkOffset2D) {
+      bin.offset.x - bin.offset.x / frag_area.width,
+      bin.offset.y - bin.offset.y / frag_area.height
+   };
+}
+
+static void
+fdm_apply_viewports(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views,
+                    VkExtent2D *frag_areas)
+{
+   VkViewport viewports[MAX_VIEWPORTS];
+   const struct apply_viewport_state *state =
+      (const struct apply_viewport_state *)data;
+
+   for (unsigned i = 0; i < state->num_viewports; i++) {
+      /* Note: If we're using shared scaling, the scale should already be the
+       * same across all views, so we can pick any view. However the number
+       * of viewports and the number of views are not guaranteed to be the
+       * same, so we need to pick the 0'th view, which always exists, to be
+       * safe.
+       *
+       * Conversely, if we're not using shared scaling then the rasterizer in
+       * the original pipeline is using only the first viewport, so we need to
+       * replicate it across all viewports.
+       */
+      VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkViewport viewport =
+         state->share_scale ? state->viewports[i] : state->viewports[0];
+      if (frag_area.width == 1 && frag_area.height == 1) {
+         viewports[i] = viewport;
+         continue;
+      }
+
+      float scale_x = 1.0f / frag_area.width;
+      float scale_y = 1.0f / frag_area.height;
+
+      viewports[i].minDepth = viewport.minDepth;
+      viewports[i].maxDepth = viewport.maxDepth;
+      viewports[i].width = viewport.width * scale_x;
+      viewports[i].height = viewport.height * scale_y;
+
+      VkOffset2D offset = fdm_per_bin_offset(frag_area, bin);
+
+      viewports[i].x = scale_x * viewport.x + offset.x;
+      viewports[i].y = scale_y * viewport.y + offset.y;
+   }
+
+   tu6_emit_viewport(cs, viewports, state->num_viewports,
+                     state->z_negative_one_to_one);
+}
+
+struct apply_scissor_state {
+   VkRect2D scissors[MAX_VIEWPORTS];
+   unsigned num_scissors;
+   bool share_scale;
+};
+
+static void
+fdm_apply_scissors(struct tu_cs *cs, void *data, VkRect2D bin, unsigned views,
+                   VkExtent2D *frag_areas)
+{
+   VkRect2D scissors[MAX_VIEWPORTS];
+   const struct apply_scissor_state *state =
+      (const struct apply_scissor_state *)data;
+
+   for (unsigned i = 0; i < state->num_scissors; i++) {
+      VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D scissor =
+         state->share_scale ? state->scissors[i] : state->scissors[0];
+      if (frag_area.width == 1 && frag_area.height == 1) {
+         scissors[i] = scissor;
+         continue;
+      }
+
+      /* Transform the scissor following the viewport. It's unclear how this
+       * is supposed to handle cases where the scissor isn't aligned to the
+       * fragment area, but we round outwards to always render partial
+       * fragments if the scissor size equals the framebuffer size and it
+       * isn't aligned to the fragment area.
+       */
+      VkOffset2D offset = fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D min = {
+         scissor.offset.x / frag_area.width + offset.x,
+         scissor.offset.y / frag_area.height + offset.y,
+      };
+      VkOffset2D max = {
+         DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
+         DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
+      };
+
+      /* Intersect the scissor with the scaled bin; this essentially replaces
+       * the window scissor.
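+       *
+       * Note that the per-bin offset maps bin.offset to itself, so the scaled
+       * bin starts at bin.offset and extends bin.extent / frag_area pixels in
+       * each dimension; the MAX2/MIN2 below clamp against exactly that
+       * region.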
+       */
+      uint32_t scaled_width = bin.extent.width / frag_area.width;
+      uint32_t scaled_height = bin.extent.height / frag_area.height;
+      scissors[i].offset.x = MAX2(min.x, bin.offset.x);
+      scissors[i].offset.y = MAX2(min.y, bin.offset.y);
+      scissors[i].extent.width =
+         MIN2(max.x, bin.offset.x + scaled_width) - scissors[i].offset.x;
+      scissors[i].extent.height =
+         MIN2(max.y, bin.offset.y + scaled_height) - scissors[i].offset.y;
+   }
+
+   tu6_emit_scissor(cs, scissors, state->num_scissors);
+}
+
 static VkResult
 tu6_draw_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
@@ -5190,14 +5381,49 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false);
 
    if (dirty & TU_CMD_DIRTY_VIEWPORTS) {
-      struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.viewport_count);
-      tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.viewport_count,
-                        cmd->state.z_negative_one_to_one);
+      if (pipeline->fs.fragment_density_map) {
+         unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+         unsigned num_viewports = cmd->state.per_view_viewport ?
+            num_views : cmd->state.viewport_count;
+         struct apply_viewport_state state = {
+            .num_viewports = num_viewports,
+            .z_negative_one_to_one = cmd->state.z_negative_one_to_one,
+            .share_scale = !cmd->state.per_view_viewport,
+         };
+         memcpy(&state.viewports, cmd->state.viewport, sizeof(state.viewports));
+         tu_cs_set_writeable(&cmd->sub_cs, true);
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * num_viewports);
+         tu_cs_set_writeable(&cmd->sub_cs, false);
+         tu_create_fdm_bin_patchpoint(cmd, &cs, 8 + 10 * num_viewports,
+                                      fdm_apply_viewports, state);
+         cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport;
+      } else {
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.viewport_count);
+         tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.viewport_count,
+                           cmd->state.z_negative_one_to_one);
+      }
    }
 
    if (dirty & TU_CMD_DIRTY_SCISSORS) {
-      struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.scissor_count);
-      tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.scissor_count);
+      if (pipeline->fs.fragment_density_map) {
+         unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
+         unsigned num_scissors = cmd->state.per_view_viewport ?
+            num_views : cmd->state.scissor_count;
+         struct apply_scissor_state state = {
+            .num_scissors = num_scissors,
+            .share_scale = !cmd->state.per_view_viewport,
+         };
+         memcpy(&state.scissors, cmd->state.scissor, sizeof(state.scissors));
+         tu_cs_set_writeable(&cmd->sub_cs, true);
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * num_scissors);
+         tu_cs_set_writeable(&cmd->sub_cs, false);
+         tu_create_fdm_bin_patchpoint(cmd, &cs, 1 + 2 * num_scissors,
+                                      fdm_apply_scissors, state);
+         cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport;
+      } else {
+         struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.scissor_count);
+         tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.scissor_count);
+      }
    }
 
    if (dirty & TU_CMD_DIRTY_BLEND) {
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 9216d1e4caa..29db0ba9740 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -283,6 +283,7 @@ struct tu_render_pass_state
    bool has_prim_generated_query_in_rp;
    bool disable_gmem;
    bool sysmem_single_prim_mode;
+   bool shared_viewport;
 
    /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
    bool draw_cs_writes_to_cond_pred;
@@ -428,6 +429,7 @@ struct tu_cmd_state
    VkViewport viewport[MAX_VIEWPORTS];
    VkRect2D scissor[MAX_SCISSORS];
    uint32_t viewport_count, scissor_count;
+   bool per_view_viewport;
 
    /* for dynamic states that can't be emitted directly */
    uint32_t dynamic_stencil_mask;
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index df61e70f2f9..bccdff4e8a9 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -4093,6 +4093,8 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
 
    struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
    struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
+   struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
+   struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
    if (hs) {
       pipeline->program.vs_param_stride = vs->output_size;
       pipeline->program.hs_param_stride = hs->output_size;
@@ -4117,6 +4119,16 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
                                     pipeline->tess.patch_control_points);
       }
    }
+
+   struct ir3_shader_variant *last_shader;
+   if (gs)
+      last_shader = gs;
+   else if (ds)
+      last_shader = ds;
+   else
+      last_shader = vs;
+
+   pipeline->program.writes_viewport = last_shader->writes_viewport;
 }
 
 static void
@@ -4251,16 +4263,34 @@ tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount)) {
       tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount,
                         pipeline->viewport.z_negative_one_to_one);
-   } else if (pipeline->viewport.set_dynamic_vp_to_static) {
+   }
+
+   /* We have to save the static viewports if set_dynamic_vp_to_static is
+    * set, but it may also be set later, during pipeline linking, when the
+    * viewport state is static, because FDM enables set_dynamic_vp_to_static
+    * in a different pipeline stage. Therefore we also have to save them if
+    * the viewport state is static, even though we emit them above.
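+    * (For example, a pipeline library that provides static viewport state
+    * may later be linked with a fragment stage that uses a fragment density
+    * map.)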
+    */
+   if (pipeline->viewport.set_dynamic_vp_to_static ||
+       !(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))) {
       memcpy(pipeline->viewport.viewports, vp_info->pViewports,
              vp_info->viewportCount * sizeof(*vp_info->pViewports));
    }
 
    pipeline->viewport.num_viewports = vp_info->viewportCount;
 
-   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
+   assert(!pipeline->viewport.set_dynamic_scissor_to_static);
+   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount)) {
       tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
 
+      /* Similarly to the above, we need to save off the static scissors if
+       * they were originally static, but nothing sets
+       * set_dynamic_scissor_to_static except FDM.
+       */
+      memcpy(pipeline->viewport.scissors, vp_info->pScissors,
+             vp_info->scissorCount * sizeof(*vp_info->pScissors));
+   }
+
    pipeline->viewport.num_scissors = vp_info->scissorCount;
 }
@@ -4519,6 +4549,42 @@ tu_pipeline_builder_parse_rast_ds(struct tu_pipeline_builder *builder,
       else
          tu_cs_emit(&cs, pipeline->rast_ds.rb_depth_cntl);
    }
+
+   /* With FDM we have to overwrite the viewport and scissor, so they have
+    * to be set dynamically. This can only be done once we know the output
+    * state and whether the viewport/scissor is dynamic. We also have to
+    * figure out whether we can use per-view viewports and enable that if
+    * so.
+    */
+   if (pipeline->fs.fragment_density_map) {
+      if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT))) {
+         pipeline->viewport.set_dynamic_vp_to_static = true;
+         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
+      }
+
+      if (!(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_SCISSOR))) {
+         pipeline->viewport.set_dynamic_scissor_to_static = true;
+         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
+      }
+
+      /* We can use per-view viewports if the last geometry stage doesn't
+       * write its own viewport.
+       */
+      pipeline->viewport.per_view_viewport =
+         !pipeline->program.writes_viewport &&
+         builder->device->physical_device->info->a6xx.has_per_view_viewport;
+
+      /* Fixup GRAS_SU_CNTL and re-emit rast state if necessary. */
+      if (pipeline->viewport.per_view_viewport) {
+         pipeline->rast.gras_su_cntl |= A6XX_GRAS_SU_CNTL_VIEWPORTINDEXINCR;
+
+         if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RAST,
+                                      tu6_rast_size(builder->device))) {
+            tu6_emit_rast(&cs, pipeline->rast.gras_su_cntl,
+                          pipeline->rast.gras_cl_cntl,
+                          pipeline->rast.polygon_mode);
+         }
+      }
+   }
 }
 
 static void
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index e04ed774f3f..b95ae6589ec 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -233,6 +233,8 @@ struct tu_pipeline
       uint32_t hs_param_dwords;
       uint32_t hs_vertices_out;
       uint32_t cs_instrlen;
+
+      bool writes_viewport;
    } program;
 
    struct
@@ -258,9 +260,12 @@ struct tu_pipeline
    struct
    {
      VkViewport viewports[MAX_VIEWPORTS];
+     VkRect2D scissors[MAX_SCISSORS];
      unsigned num_viewports, num_scissors;
      bool set_dynamic_vp_to_static;
+     bool set_dynamic_scissor_to_static;
      bool z_negative_one_to_one;
+     bool per_view_viewport;
    } viewport;
 
    /* Used only for libraries. compiled_shaders only contains variants compiled