panfrost: Apply direct dispatch WLS instance limit

Apply the direct dispatch WLS instance limit to panfrost as well to keep
compute jobs with large workgroup counts from running out of memory.

Fixes: 1304f4578d ("panfrost: Adapt emit_shared_memory for indirect dispatch")
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: John Anthony <john.anthony@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34979>
(cherry picked from commit 64ce37b2d9)
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2025-05-16 13:50:57 +02:00 committed by Eric Engestrom
parent cc2e341a14
commit c01db1fb7f
3 changed files with 13 additions and 30 deletions

View file

@ -5134,7 +5134,7 @@
"description": "panfrost: Apply direct dispatch WLS instance limit",
"nominated": true,
"nomination_type": 2,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": "1304f4578d2ee206be20bab8f9aa9a55ae4563b5",
"notes": null

View file

@ -1587,29 +1587,6 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
return ubos.gpu;
}
/*
 * Choose the number of WLS instances to allocate. This must be a power-of-two.
 * The number of WLS instances limits the number of concurrent tasks on a given
 * shader core; setting it to the (rounded) total number of tasks avoids any
 * throttling. Smaller values save memory at the expense of possible throttling.
 *
 * With indirect dispatch, we don't know at launch-time how many tasks will be
 * needed, so we use a conservative value that's unlikely to cause slowdown in
 * practice without wasting too much memory.
 *
 * With direct dispatch, each of the three grid dimensions is passed through
 * util_next_power_of_two() and the three results are multiplied together.
 *
 * NOTE(review): the direct-dispatch product has no upper bound in this
 * function, so the returned count grows with the grid size and the
 * multiplication is not checked for overflow -- confirm that callers can
 * cope with very large grids.
 */
static unsigned
panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
{
if (grid->indirect) {
/* May need tuning in the future, conservative guess */
return 128;
} else {
/* Each dimension is rounded on its own (presumably upwards -- verify
 * against util_next_power_of_two), so the product of the three factors
 * is itself a power of two. */
return util_next_power_of_two(grid->grid[0]) *
util_next_power_of_two(grid->grid[1]) *
util_next_power_of_two(grid->grid[2]);
}
}
static uint64_t
panfrost_emit_shared_memory(struct panfrost_batch *batch,
const struct pipe_grid_info *grid)
@ -1620,10 +1597,15 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
struct panfrost_ptr t =
pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
struct pan_compute_dim local_size = {grid->block[0], grid->block[1],
grid->block[2]};
struct pan_compute_dim dim = {grid->grid[0], grid->grid[1], grid->grid[2]};
struct pan_tls_info info = {
.tls.size = ss->info.tls_size,
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
.wls.instances = panfrost_choose_wls_instance_count(grid),
.wls.instances = pan_calc_wls_instances(&local_size, &dev->kmod.props,
grid->indirect ? NULL : &dim),
};
if (ss->info.tls_size) {
@ -1637,8 +1619,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
}
if (info.wls.size) {
unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
dev->core_id_range;
unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
dev->core_id_range);
struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);

View file

@ -200,7 +200,8 @@ emit_tls(struct panfrost_batch *batch,
struct pan_tls_info info = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
.wls.instances = pan_wls_instances(dim),
.wls.instances =
pan_calc_wls_instances(&shader->local_size, &dev->kmod.props, dim),
};
if (info.tls.size) {
@ -210,8 +211,8 @@ emit_tls(struct panfrost_batch *batch,
}
if (info.wls.size) {
unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
dev->core_id_range;
unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
dev->core_id_range);
struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);