v3dv: choose a larger CSD supergroup size if possible

Each supergroup executes a number of batches. Each batch has 16 elements (one per QPU lane), except possibly the last batch, which might be incomplete. Until now, we packed a single workgroup in each supergroup, which can lead to more incomplete batches and less efficient use of the QPUs depending on the configuration of workgroups being dispatched. This patch computes a number of workgroups per supergroup so that we reduce or completely eliminate incomplete batches if possible. It should be noted, however, that TSY barriers act on supergroups, so larger supergroups lead to larger syncpoints on barriers too. A follow-up patch will try to find a good balance for compute shaders that use such barriers. This improves performance of Sascha Willems' computecloth demo by ~13%. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10541>
2026-05-08 11:18:08 +02:00 · 2021-04-28 11:09:04 +02:00 · 2021-04-28 11:09:04 +02:00 · 2e0f6e5705
commit 2e0f6e5705
parent aebb47b7d1
2 changed files with 45 additions and 7 deletions
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@ -5268,6 +5268,37 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
   }
 }

+/* Choose a number of workgroups per supergroup that maximizes lane occupancy
+ * (fewest unused lanes out of 16). A supergroup can pack up to 16 workgroups.
+ */
+static uint32_t
+choose_workgroups_per_supergroup(uint32_t num_wgs, uint32_t wg_size)
+{
+   uint32_t best_wgs_per_sg = 1;
+   uint32_t best_unused_lanes = 16;
+   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= 16; wgs_per_sg++) {
+      /* Don't try to pack more workgroups per supergroup than the total amount
+       * of workgroups dispatched; settle for the best candidate seen so far.
+       */
+      if (wgs_per_sg > num_wgs)
+         return best_wgs_per_sg;

+      /* Compute wasted lanes for this configuration and keep track of the
+       * config with less waste. A config with no waste is returned right away.
+       */
+      uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f;
+      if (unused_lanes == 0)
+         return wgs_per_sg;

+      if (unused_lanes < best_unused_lanes) {
+         best_wgs_per_sg = wgs_per_sg;
+         best_unused_lanes = unused_lanes;
+      }
+   }

+   return best_wgs_per_sg;
+}
+
 static struct v3dv_job *
 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
                          uint32_t group_count_x,
@ -5305,20 +5336,25 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
   const struct v3d_compute_prog_data *cpd =
      cs_variant->prog_data.cs;

-   const uint32_t wgs_per_sg = 1; /* FIXME */
+   const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
   const uint32_t wg_size = cpd->local_size[0] *
                            cpd->local_size[1] *
                            cpd->local_size[2];
-   submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
-   submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
-                       V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
+
+   uint32_t wgs_per_sg = choose_workgroups_per_supergroup(num_wgs, wg_size);
+   uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
+   uint32_t whole_sgs = num_wgs / wgs_per_sg;
+   uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
+   uint32_t num_batches = batches_per_sg * whole_sgs +
+                          DIV_ROUND_UP(rem_wgs * wg_size, 16);
+
+   submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
+   submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
   submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
   if (wg_size_out)
      *wg_size_out = wg_size;

-   uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
-   submit->cfg[4] = batches_per_wg *
-                    (group_count_x * group_count_y * group_count_z) - 1;
+   submit->cfg[4] = num_batches - 1;
   assert(submit->cfg[4] != ~0);

   assert(pipeline->shared_data->assembly_bo);
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@ -233,6 +233,8 @@ const nir_shader_compiler_options v3dv_nir_options = {
   .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic
                                   * needs to be supported */
   .lower_interpolate_at = true,
+   .divergence_analysis_options =
+      nir_divergence_multiple_workgroup_per_compute_subgroup
 };

 const nir_shader_compiler_options *