From 1827b4a2db3b1f82378a4a6fa3d2e9baa1b10ac5 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 9 Nov 2022 12:10:37 -0500 Subject: [PATCH] panfrost: Compile indirect dispatch shader on first use For 2D UI workloads and even most 3D workloads, the indirect dispatch shader won't actually be needed, but we currently compile it during eglInitialize() on every v7 application. That hurts app start-up time, especially given that this shader doesn't hit the disk cache. We can instead defer compiling this shader until it's actually needed, when glDispatchComputeIndirect() gets called. The tradeoff is that the first glDispatchComputeIndirect() call will be (much) slower than successive calls, since we need to build and compile this internal shader. I'm unconvinced that's a problem in practice. An app would need to call glDispatchComputeIndirect for the first time in the middle of a scene. 2D apps never would call that, OpenCL doesn't have that, and GL compute will have the same costs just moved around. So it's down to a 3D GLES3.1 app that indirectly dispatches compute for the first time in the middle of a scene. Which, meh? It's not entirely implausible but we have bigger fish to fry, and this fixes a real problem (about 5% of eglInitialize time spent building this shader that won't actually get used). es2_info starts slightly faster with this change. 
Signed-off-by: Alyssa Rosenzweig Part-of: --- src/gallium/drivers/panfrost/pan_cmdstream.c | 1 - src/panfrost/lib/pan_indirect_dispatch.c | 68 +++++++++++--------- src/panfrost/lib/pan_indirect_dispatch.h | 3 - 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index cdb793e0283..cd6a0641c1e 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -4827,7 +4827,6 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base); #if PAN_GPU_INDIRECTS - GENX(pan_indirect_dispatch_init)(dev); GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base); #endif } diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c index f2f78cea8ea..8a6ad81167d 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.c +++ b/src/panfrost/lib/pan_indirect_dispatch.c @@ -53,38 +53,8 @@ get_tls(const struct panfrost_device *dev) pan_size(RENDERER_STATE); } -unsigned -GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - const struct pan_indirect_dispatch_info *inputs) -{ - struct panfrost_device *dev = pool->dev; - struct panfrost_ptr job = - pan_pool_alloc_desc(pool, COMPUTE_JOB); - void *invocation = - pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); - - panfrost_pack_work_groups_compute(invocation, - 1, 1, 1, 1, 1, 1, - false, false); - - pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { - cfg.job_task_split = 2; - } - - pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { - cfg.state = get_rsd(dev); - cfg.thread_storage = get_tls(pool->dev); - cfg.push_uniforms = - pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16); - } - - return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, - false, true, 0, 0, 
&job, false); -} - -void -GENX(pan_indirect_dispatch_init)(struct panfrost_device *dev) +static void +pan_indirect_dispatch_init(struct panfrost_device *dev) { nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, @@ -192,6 +162,40 @@ GENX(pan_indirect_dispatch_init)(struct panfrost_device *dev) }; } +unsigned +GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + const struct pan_indirect_dispatch_info *inputs) +{ + struct panfrost_device *dev = pool->dev; + struct panfrost_ptr job = + pan_pool_alloc_desc(pool, COMPUTE_JOB); + void *invocation = + pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); + + /* If we haven't compiled the indirect dispatch shader yet, do it now */ + if (!dev->indirect_dispatch.bin) + pan_indirect_dispatch_init(dev); + + panfrost_pack_work_groups_compute(invocation, + 1, 1, 1, 1, 1, 1, + false, false); + + pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = 2; + } + + pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.state = get_rsd(dev); + cfg.thread_storage = get_tls(pool->dev); + cfg.push_uniforms = + pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16); + } + + return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, + false, true, 0, 0, &job, false); +} + void GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev) { diff --git a/src/panfrost/lib/pan_indirect_dispatch.h b/src/panfrost/lib/pan_indirect_dispatch.h index 26ab77939a6..e996c76551f 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.h +++ b/src/panfrost/lib/pan_indirect_dispatch.h @@ -41,9 +41,6 @@ GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, struct pan_scoreboard *scoreboard, const struct pan_indirect_dispatch_info *dispatch_info); -void -GENX(pan_indirect_dispatch_init)(struct panfrost_device *dev); - void GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev);