freedreno/a6xx: Fix max_threads to account for reg footprint
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

The register footprint could limit occupancy.  We need to take this into
account to avoid deadlocks when a kernel is using barriers.

Fixes: 6d85cd6a3b ("freedreno: Implement get_compute_state_info for Adreno 6xx/7xx")
Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35745>
This commit is contained in:
Rob Clark 2025-06-25 08:45:17 -07:00 committed by Marge Bot
parent 6f5ff6be44
commit 2e00925c81

View file

@@ -320,28 +320,32 @@ fd6_compute_state_delete(struct pipe_context *pctx, void *_hwcso)
 }
 static void
-fd6_get_compute_state_info(struct pipe_context *pctx, void *cso, struct pipe_compute_state_object_info *info)
+fd6_get_compute_state_info(struct pipe_context *pctx, void *cso, struct pipe_compute_state_object_info *cinfo)
 {
 static struct ir3_shader_key key; /* static is implicitly zeroed */
 struct fd6_compute_state *cs = (struct fd6_compute_state *)cso;
 struct ir3_shader_state *hwcso = (struct ir3_shader_state *)cs->hwcso;
 struct ir3_shader_variant *v = ir3_shader_variant(ir3_get_shader(hwcso), key, false, &pctx->debug);
-struct fd_context *ctx = fd_context(pctx);
-uint32_t threadsize_base = ctx->screen->info->threadsize_base;
+const struct fd_dev_info *info = fd_context(pctx)->screen->info;
+uint32_t threadsize_base = info->threadsize_base;
-info->max_threads = threadsize_base * ctx->screen->info->max_waves;
-info->simd_sizes = threadsize_base;
-info->preferred_simd_size = threadsize_base;
+cinfo->max_threads = threadsize_base * info->max_waves;
+cinfo->simd_sizes = threadsize_base;
+cinfo->preferred_simd_size = threadsize_base;
-if (ctx->screen->info->a6xx.supports_double_threadsize &&
-v->info.double_threadsize) {
+if (info->a6xx.supports_double_threadsize && v->info.double_threadsize) {
-info->max_threads *= 2;
-info->simd_sizes |= (threadsize_base * 2);
-info->preferred_simd_size *= 2;
+cinfo->max_threads *= 2;
+cinfo->simd_sizes |= (threadsize_base * 2);
+cinfo->preferred_simd_size *= 2;
 }
-info->private_memory = v->pvtmem_size;
+unsigned reg_file_size_vec4 = info->a6xx.reg_size_vec4 * threadsize_base * info->wave_granularity;
+unsigned vec4_regs_per_thread = v->info.max_reg + 1;
+cinfo->max_threads = MIN2(cinfo->max_threads, reg_file_size_vec4 / vec4_regs_per_thread);
+cinfo->private_memory = v->pvtmem_size;
 }
 template <chip CHIP>