diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index a2666b39f6b..d1e641b9316 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -5268,6 +5268,37 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( } } +/* Choose a number of workgroups per supergroup that maximizes + * lane occupancy. We can pack up to 16 workgroups into a supergroup. + */ +static uint32_t +choose_workgroups_per_supergroup(uint32_t num_wgs, uint32_t wg_size) +{ + uint32_t best_wgs_per_sg = 1; + uint32_t best_unused_lanes = 16; + for (uint32_t wgs_per_sg = 1; wgs_per_sg <= 16; wgs_per_sg++) { + /* Don't try to pack more workgroups per supergroup than the total amount + * of workgroups dispatched. + */ + if (wgs_per_sg > num_wgs) + return best_wgs_per_sg; + + /* Compute wasted lanes for this configuration and keep track of the + * config with the least waste. + */ + uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f; + if (unused_lanes == 0) + return wgs_per_sg; + + if (unused_lanes < best_unused_lanes) { + best_wgs_per_sg = wgs_per_sg; + best_unused_lanes = unused_lanes; + } + } + + return best_wgs_per_sg; +} + static struct v3dv_job * cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t group_count_x, @@ -5305,20 +5336,25 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, const struct v3d_compute_prog_data *cpd = cs_variant->prog_data.cs; - const uint32_t wgs_per_sg = 1; /* FIXME */ + const uint32_t num_wgs = group_count_x * group_count_y * group_count_z; const uint32_t wg_size = cpd->local_size[0] * cpd->local_size[1] * cpd->local_size[2]; - submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; - submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) << - V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); + + uint32_t wgs_per_sg = choose_workgroups_per_supergroup(num_wgs, wg_size); + uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); + uint32_t 
whole_sgs = num_wgs / wgs_per_sg; + uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; + uint32_t num_batches = batches_per_sg * whole_sgs + + DIV_ROUND_UP(rem_wgs * wg_size, 16); + + submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; + submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; if (wg_size_out) *wg_size_out = wg_size; - uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16); - submit->cfg[4] = batches_per_wg * - (group_count_x * group_count_y * group_count_z) - 1; + submit->cfg[4] = num_batches - 1; assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 6a573a78841..e6bebd905f6 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -233,6 +233,8 @@ const nir_shader_compiler_options v3dv_nir_options = { .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic * needs to be supported */ .lower_interpolate_at = true, + .divergence_analysis_options = + nir_divergence_multiple_workgroup_per_compute_subgroup }; const nir_shader_compiler_options *