diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 8d086d5d2de..3550933a44a 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1587,6 +1587,29 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
    return ubos.gpu;
 }
 
+/*
+ * Choose the number of WLS instances to allocate. This must be a power-of-two.
+ * The number of WLS instances limits the number of concurrent tasks on a given
+ * shader core, setting to the (rounded) total number of tasks avoids any
+ * throttling. Smaller values save memory at the expense of possible throttling.
+ *
+ * With indirect dispatch, we don't know at launch-time how many tasks will be
+ * needed, so we use a conservative value that's unlikely to cause slowdown in
+ * practice without wasting too much memory.
+ */
+static unsigned
+panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
+{
+   if (grid->indirect) {
+      /* May need tuning in the future, conservative guess */
+      return 128;
+   } else {
+      return util_next_power_of_two(grid->grid[0]) *
+             util_next_power_of_two(grid->grid[1]) *
+             util_next_power_of_two(grid->grid[2]);
+   }
+}
+
 static mali_ptr
 panfrost_emit_shared_memory(struct panfrost_batch *batch,
                             const struct pipe_grid_info *grid)
@@ -1601,9 +1624,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
    struct pan_tls_info info = {
       .tls.size = ss->info.tls_size,
       .wls.size = ss->info.wls_size,
-      .wls.dim.x = grid->grid[0],
-      .wls.dim.y = grid->grid[1],
-      .wls.dim.z = grid->grid[2],
+      .wls.instances = panfrost_choose_wls_instance_count(grid),
    };
 
    if (ss->info.tls_size) {
@@ -1616,10 +1637,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
    }
 
    if (ss->info.wls_size) {
-      unsigned size =
-         pan_wls_adjust_size(info.wls.size) *
-         pan_wls_instances(&info.wls.dim) *
-         dev->core_id_range;
+      unsigned size = pan_wls_adjust_size(info.wls.size) *
+                      info.wls.instances * dev->core_id_range;
 
       struct panfrost_bo *bo =
          panfrost_batch_get_shared_memory(batch, size, 1);
@@ -4138,12 +4157,7 @@ panfrost_launch_grid(struct pipe_context *pipe,
    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 
-   struct panfrost_shader_state *cs =
-      &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
-
-   /* Indirect dispatch can't handle workgroup local storage since that
-    * would require dynamic memory allocation. Bail in this case. */
-   if (info->indirect && ((cs->info.wls_size != 0) || !PAN_GPU_INDIRECTS)) {
+   if (info->indirect && !PAN_GPU_INDIRECTS) {
       struct pipe_transfer *transfer;
       uint32_t *params =
          pipe_buffer_map_range(pipe, info->indirect,
                                info->indirect_offset,
@@ -4215,6 +4229,9 @@ panfrost_launch_grid(struct pipe_context *pipe,
       cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
    }
 #else
+   struct panfrost_shader_state *cs =
+      &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
+
    pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
       cfg.workgroup_size_x = info->block[0];
       cfg.workgroup_size_y = info->block[1];
diff --git a/src/panfrost/lib/pan_cs.c b/src/panfrost/lib/pan_cs.c
index 233af107256..6113e563a40 100644
--- a/src/panfrost/lib/pan_cs.c
+++ b/src/panfrost/lib/pan_cs.c
@@ -566,7 +566,7 @@ GENX(pan_emit_tls)(const struct pan_tls_info *info,
          assert((info->wls.ptr & 0xffffffff00000000ULL) == ((info->wls.ptr + info->wls.size - 1) & 0xffffffff00000000ULL));
          cfg.wls_base_pointer = info->wls.ptr;
          unsigned wls_size = pan_wls_adjust_size(info->wls.size);
-         cfg.wls_instances = pan_wls_instances(&info->wls.dim);
+         cfg.wls_instances = info->wls.instances;
          cfg.wls_size_scale = util_logbase2(wls_size) + 1;
       } else {
          cfg.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h
index 2ffa6017ef7..8186102e5c0 100644
--- a/src/panfrost/lib/pan_cs.h
+++ b/src/panfrost/lib/pan_cs.h
@@ -85,7 +85,7 @@ struct pan_tls_info {
    } tls;
 
    struct {
-      struct pan_compute_dim dim;
+      unsigned instances;
       mali_ptr ptr;
       unsigned size;
    } wls;