diff --git a/.pick_status.json b/.pick_status.json index 24ec042edbc..1a2763041bd 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -5134,7 +5134,7 @@ "description": "panfrost: Apply direct dispatch WLS instance limit", "nominated": true, "nomination_type": 2, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "1304f4578d2ee206be20bab8f9aa9a55ae4563b5", "notes": null diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index a9b89f1e99c..fa2d5b34da9 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1587,29 +1587,6 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, return ubos.gpu; } -/* - * Choose the number of WLS instances to allocate. This must be a power-of-two. - * The number of WLS instances limits the number of concurrent tasks on a given - * shader core, setting to the (rounded) total number of tasks avoids any - * throttling. Smaller values save memory at the expense of possible throttling. - * - * With indirect dispatch, we don't know at launch-time how many tasks will be - * needed, so we use a conservative value that's unlikely to cause slowdown in - * practice without wasting too much memory. - */ -static unsigned -panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid) -{ - if (grid->indirect) { - /* May need tuning in the future, conservative guess */ - return 128; - } else { - return util_next_power_of_two(grid->grid[0]) * - util_next_power_of_two(grid->grid[1]) * - util_next_power_of_two(grid->grid[2]); - } -} - static uint64_t panfrost_emit_shared_memory(struct panfrost_batch *batch, const struct pipe_grid_info *grid) @@ -1620,10 +1597,15 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); + struct pan_compute_dim local_size = {grid->block[0], grid->block[1], + grid->block[2]}; + struct pan_compute_dim dim = {grid->grid[0], grid->grid[1], grid->grid[2]}; + struct pan_tls_info info = { .tls.size = ss->info.tls_size, .wls.size = ss->info.wls_size + grid->variable_shared_mem, - .wls.instances = panfrost_choose_wls_instance_count(grid), + .wls.instances = pan_calc_wls_instances(&local_size, &dev->kmod.props, + grid->indirect ? NULL : &dim), }; if (ss->info.tls_size) { @@ -1637,8 +1619,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, } if (info.wls.size) { - unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances * - dev->core_id_range; + unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances, + dev->core_id_range); struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1); diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index 22133415ec1..cc104d953b8 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -200,7 +200,8 @@ emit_tls(struct panfrost_batch *batch, struct pan_tls_info info = { .tls.size = shader->info.tls_size, .wls.size = shader->info.wls_size, - .wls.instances = pan_wls_instances(dim), + .wls.instances = + pan_calc_wls_instances(&shader->local_size, &dev->kmod.props, dim), }; if (info.tls.size) { @@ -210,8 +211,8 @@ emit_tls(struct panfrost_batch *batch, } if (info.wls.size) { - unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances * - dev->core_id_range; + unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances, + dev->core_id_range); struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);