From 2e0f6e570596438560cdde5a61e0bd0b45267aa6 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 28 Apr 2021 11:09:04 +0200 Subject: [PATCH] v3dv: choose a larger CSD supergroup size if possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each supergroup executes a number of batches. Each batch has 16 elements (one per QPU lane), except possibly the last batch which might be incomplete. Until now, we packed a single workgroup in each supergroup, which can lead to more incomplete batches and less efficient use of the QPUs depending on the configuration of workgroups being dispatched. This patch computes a number of workgroups per supergroup so that we reduce or completely eliminate incomplete batches if possible. It should be noted, however, that TSY barriers act on supergroups, so larger supergroups lead to larger syncpoints on barriers too. A follow-up patch will try to find a good balance for compute shaders that use such barriers. This improves performance of Sascha Willems' computecloth demo by ~13%. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/vulkan/v3dv_cmd_buffer.c | 50 +++++++++++++++++++++++---- src/broadcom/vulkan/v3dv_pipeline.c | 2 ++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index a2666b39f6b..d1e641b9316 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -5268,6 +5268,37 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( } } +/* Choose a number of workgroups per supergroup that maximizes + * lane occupancy. We can pack up to 16 workgroups into a supergroup. 
+ */ +static uint32_t +choose_workgroups_per_supergroup(uint32_t num_wgs, uint32_t wg_size) +{ + uint32_t best_wgs_per_sg = 1; + uint32_t best_unused_lanes = 16; + for (uint32_t wgs_per_sg = 1; wgs_per_sg <= 16; wgs_per_sg++) { + /* Don't try to pack more workgroups per supergroup than the total amount + * of workgroups dispatched. + */ + if (wgs_per_sg > num_wgs) + return best_wgs_per_sg; + + /* Compute wasted lanes for this configuration and keep track of the + * config with the least waste. + */ + uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f; + if (unused_lanes == 0) + return wgs_per_sg; + + if (unused_lanes < best_unused_lanes) { + best_wgs_per_sg = wgs_per_sg; + best_unused_lanes = unused_lanes; + } + } + + return best_wgs_per_sg; +} + static struct v3dv_job * cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t group_count_x, @@ -5305,20 +5336,25 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, const struct v3d_compute_prog_data *cpd = cs_variant->prog_data.cs; - const uint32_t wgs_per_sg = 1; /* FIXME */ + const uint32_t num_wgs = group_count_x * group_count_y * group_count_z; const uint32_t wg_size = cpd->local_size[0] * cpd->local_size[1] * cpd->local_size[2]; - submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; - submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) << - V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); + + uint32_t wgs_per_sg = choose_workgroups_per_supergroup(num_wgs, wg_size); + uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); + uint32_t whole_sgs = num_wgs / wgs_per_sg; + uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; + uint32_t num_batches = batches_per_sg * whole_sgs + + DIV_ROUND_UP(rem_wgs * wg_size, 16); + + submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; + submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; if (wg_size_out) 
*wg_size_out = wg_size; - uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16); - submit->cfg[4] = batches_per_wg * - (group_count_x * group_count_y * group_count_z) - 1; + submit->cfg[4] = num_batches - 1; assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 6a573a78841..e6bebd905f6 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -233,6 +233,8 @@ const nir_shader_compiler_options v3dv_nir_options = { .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic * needs to be supported */ .lower_interpolate_at = true, + .divergence_analysis_options = + nir_divergence_multiple_workgroup_per_compute_subgroup }; const nir_shader_compiler_options *