From b8d10d9e87a32d039a6b9b11b61d969573d1d11c Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Wed, 14 Sep 2022 20:37:55 +0200
Subject: [PATCH] gallium: split up req_local_mem

This will be required if a frontend has to request additional shared mem
on top of the shader declared one, but wants to create the CSO before
knowing the total amount.

In OpenCL applications can bind additional shared mem through kernel
arguments and this happens quite late.

Note: Clover sets the req_local_mem incorrectly before so we can leave it
as broken.

v2: fix panfrost code (Alyssa)

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@collabora.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18581>
---
 .../auxiliary/driver_trace/tr_dump_state.c    |  3 +-
 .../drivers/freedreno/a6xx/fd6_compute.c      |  8 +--
 .../drivers/freedreno/ir3/ir3_gallium.c       |  2 +-
 src/gallium/drivers/iris/iris_program.c       |  2 +-
 src/gallium/drivers/iris/iris_state.c         |  2 +-
 src/gallium/drivers/llvmpipe/lp_state_cs.c    |  3 +-
 .../drivers/nouveau/nv50/nv50_compute.c       |  3 +-
 src/gallium/drivers/nouveau/nv50/nv50_state.c |  2 +-
 .../drivers/nouveau/nvc0/nvc0_compute.c       |  2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c |  2 +-
 .../drivers/nouveau/nvc0/nve4_compute.c       | 18 +++----
 src/gallium/drivers/panfrost/pan_cmdstream.c  |  2 +-
 src/gallium/drivers/panfrost/pan_shader.c     |  2 +-
 src/gallium/drivers/r600/evergreen_compute.c  |  4 +-
 src/gallium/drivers/radeonsi/si_compute.c     | 49 +++++++++++++------
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 src/gallium/drivers/softpipe/sp_compute.c     |  7 +--
 src/gallium/drivers/svga/svga_pipe_cs.c       |  2 +-
 src/gallium/drivers/virgl/virgl_context.c     |  2 +-
 src/gallium/frontends/clover/core/kernel.cpp  |  5 +-
 src/gallium/frontends/lavapipe/lvp_pipeline.c |  2 +-
 src/gallium/frontends/rusticl/core/kernel.rs  | 20 +++++---
 .../frontends/rusticl/mesa/pipe/context.rs    | 13 +++--
 src/gallium/include/pipe/p_state.h            |  9 +++-
 src/mesa/state_tracker/st_pbo_compute.c       |  4 +-
 src/mesa/state_tracker/st_program.c           |  2 +-
 26 files changed, 109 insertions(+), 62 deletions(-)

diff --git a/src/gallium/auxiliary/driver_trace/tr_dump_state.c b/src/gallium/auxiliary/driver_trace/tr_dump_state.c
index 410ac5a1abc..20b507c4efb 100644
--- a/src/gallium/auxiliary/driver_trace/tr_dump_state.c
+++ b/src/gallium/auxiliary/driver_trace/tr_dump_state.c
@@ -349,7 +349,7 @@ void trace_dump_compute_state(const struct pipe_compute_state *state)
    }
    trace_dump_member_end();
 
-   trace_dump_member(uint, state, req_local_mem);
+   trace_dump_member(uint, state, static_shared_mem);
    trace_dump_member(uint, state, req_input_mem);
 
    trace_dump_struct_end();
@@ -1081,6 +1081,7 @@ void trace_dump_grid_info(const struct pipe_grid_info *state)
 
    trace_dump_member(uint, state, pc);
    trace_dump_member(ptr, state, input);
+   trace_dump_member(uint, state, variable_shared_mem);
 
    trace_dump_member_begin("block");
    trace_dump_array(uint, state->block, ARRAY_SIZE(state->block));
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
index 774e1603cb6..9d9df192f94 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
@@ -42,7 +42,8 @@
 /* maybe move to fd6_program? */
 static void
 cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
-                struct ir3_shader_variant *v) assert_dt
+                struct ir3_shader_variant *v,
+                uint32_t variable_shared_size) assert_dt
 {
    const struct ir3_info *i = &v->info;
    enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
@@ -71,7 +72,8 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
                COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
                A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
 
-   uint32_t shared_size = MAX2(((int)v->cs.req_local_mem - 1) / 1024, 1);
+   uint32_t shared_size =
+      MAX2(((int)v->cs.req_local_mem + variable_shared_size- 1) / 1024, 1);
    OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
    OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
                      A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
@@ -125,7 +127,7 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
       return;
 
    if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
-      cs_program_emit(ctx, ring, v);
+      cs_program_emit(ctx, ring, v, info->variable_shared_mem);
 
    fd6_emit_cs_state(ctx, ring, v);
    fd6_emit_cs_consts(v, ring, ctx, info);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 57788c2e788..f2a574ac9c9 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -315,7 +315,7 @@ ir3_shader_compute_state_create(struct pipe_context *pctx,
                               .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                           }, NULL);
    shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4;     /* byte->dword */
-   shader->cs.req_local_mem = cso->req_local_mem;
+   shader->cs.req_local_mem = cso->static_shared_mem;
 
    struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
 
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index d7ced54483d..e4acde2e652 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -2534,7 +2534,7 @@ iris_create_compute_state(struct pipe_context *ctx,
    struct iris_uncompiled_shader *ish =
       iris_create_uncompiled_shader(screen, nir, NULL);
    ish->kernel_input_size = state->req_input_mem;
-   ish->kernel_shared_size = state->req_local_mem;
+   ish->kernel_shared_size = state->static_shared_mem;
 
    // XXX: disallow more than 64KB of shared variables
 
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 79f3758a36a..e8c73541326 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -7402,7 +7402,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice,
 
       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
          idd.SharedLocalMemorySize =
-            encode_slm_size(GFX_VER, ish->kernel_shared_size);
+            encode_slm_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
          idd.KernelStartPointer =
             KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data,
                                                        dispatch.simd_size);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_cs.c b/src/gallium/drivers/llvmpipe/lp_state_cs.c
index 8a2c28a7b34..8554a7f45f6 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_cs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_cs.c
@@ -500,7 +500,6 @@ llvmpipe_create_compute_state(struct pipe_context *pipe,
    shader->no = cs_no++;
 
    shader->base.type = templ->ir_type;
-   shader->req_local_mem = templ->req_local_mem;
    if (templ->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
       struct blob_reader reader;
       const struct pipe_binary_program_header *hdr = templ->prog;
@@ -1428,7 +1427,7 @@ llvmpipe_launch_grid(struct pipe_context *pipe,
    job_info.block_size[1] = info->block[1];
    job_info.block_size[2] = info->block[2];
    job_info.work_dim = info->work_dim;
-   job_info.req_local_mem = llvmpipe->cs->req_local_mem;
+   job_info.req_local_mem = llvmpipe->cs->req_local_mem + info->variable_shared_mem;
    job_info.zero_initialize_shared_memory = llvmpipe->cs->zero_initialize_shared_memory;
    job_info.current = &llvmpipe->csctx->cs.current;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 1213effd53d..e6a597c8182 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -579,8 +579,9 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
    PUSH_DATA (push, cp->code_base);
 
+   int shared_size = cp->cp.smem_size + info->variable_shared_mem + cp->parm_size + 0x14;
    BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
-   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x14, 0x40));
+   PUSH_DATA (push, align(shared_size, 0x40));
    BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
    PUSH_DATA (push, cp->max_gpr);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 0cdf2afacc9..3161549c815 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -861,7 +861,7 @@ nv50_cp_state_create(struct pipe_context *pipe,
       return NULL;
    }
 
-   prog->cp.smem_size = cso->req_local_mem;
+   prog->cp.smem_size = cso->static_shared_mem;
    prog->parm_size = cso->req_input_mem;
 
    return (void *)prog;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index e9f0cbe7768..237b74ef4b5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -446,7 +446,7 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */
 
    BEGIN_NVC0(push, NVC0_CP(SHARED_SIZE), 3);
-   PUSH_DATA (push, align(cp->cp.smem_size, 0x100));
+   PUSH_DATA (push, align(cp->cp.smem_size + info->variable_shared_mem, 0x100));
    PUSH_DATA (push, info->block[0] * info->block[1] * info->block[2]);
    PUSH_DATA (push, cp->num_barriers);
    BEGIN_NVC0(push, NVC0_CP(CP_GPR_ALLOC), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 4d4b5c431ea..9eb442c3c10 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -741,7 +741,7 @@ nvc0_cp_state_create(struct pipe_context *pipe,
    prog->type = PIPE_SHADER_COMPUTE;
    prog->pipe.type = cso->ir_type;
 
-   prog->cp.smem_size = cso->req_local_mem;
+   prog->cp.smem_size = cso->static_shared_mem;
    prog->parm_size = cso->req_input_mem;
 
    switch(cso->ir_type) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 8758efd59c4..e673984bc87 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -627,6 +627,7 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
+   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;
 
    NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
    NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
@@ -647,17 +648,16 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
    NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
    NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
 
-   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-                                 align(cp->cp.smem_size, 0x100));
+   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
    NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
    NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);
 
-   if (cp->cp.smem_size > (32 << 10))
+   if (shared_size > (32 << 10))
       NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                                     DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
    else
-   if (cp->cp.smem_size > (16 << 10))
+   if (shared_size > (16 << 10))
       NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                                     DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
    else
@@ -690,6 +690,7 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
+   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;
 
    NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
    NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
@@ -705,8 +706,7 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
    NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
    NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
 
-   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-                                 align(cp->cp.smem_size, 0x100));
+   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
    NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
    NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);
@@ -749,12 +749,12 @@ gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
    struct nvc0_program *cp = nvc0->compprog;
    struct nvc0_screen *screen = nvc0->screen;
    uint64_t entry = screen->text->offset + cp->code_base;
+   uint32_t shared_size = cp->cp.smem_size + info->variable_shared_mem;
 
    NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
    NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
    NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
-   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-                                  align(cp->cp.smem_size, 0x100));
+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE, align(shared_size, 0x100));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, cp->hdr[1] & 0xfffff0);
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
@@ -764,7 +764,7 @@ gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
    NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
    NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
    NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
-                                  gv100_sm_config_smem_size(cp->cp.smem_size));
+                                  gv100_sm_config_smem_size(shared_size));
 
    NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
    NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 1782ca17ce9..0d89e25245b 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1625,7 +1625,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
 
         struct pan_tls_info info = {
                 .tls.size = ss->info.tls_size,
-                .wls.size = ss->info.wls_size,
+                .wls.size = ss->info.wls_size + grid->variable_shared_mem,
                 .wls.instances = panfrost_choose_wls_instance_count(grid),
         };
 
diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c
index 47e69662937..5af151bd37f 100644
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@@ -481,7 +481,7 @@ panfrost_create_compute_state(
         assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
 
         panfrost_shader_get(pctx->screen, &ctx->shaders, &ctx->descs,
-                            so, &ctx->base.debug, v, cso->req_local_mem);
+                            so, &ctx->base.debug, v, cso->static_shared_mem);
 
         /* The NIR becomes invalid after this. For compute kernels, we never
          * need to access it again. Don't keep a dangling pointer around.
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index baef801d0ec..8f22d2d5d15 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -435,7 +435,7 @@ static void *evergreen_create_compute_state(struct pipe_context *ctx,
 #endif
 
 	shader->ctx = rctx;
-	shader->local_size = cso->req_local_mem;
+	shader->local_size = cso->static_shared_mem;
 	shader->input_size = cso->req_input_mem;
 
 	shader->ir_type = cso->ir_type;
@@ -610,7 +610,7 @@ static void evergreen_emit_dispatch(struct r600_context *rctx,
 	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
 	unsigned wave_divisor = (16 * num_pipes);
 	int group_size = 1;
-	unsigned lds_size = shader->local_size / 4;
+	unsigned lds_size = (shader->local_size + info->variable_shared_mem) / 4;
 
 	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
 	    shader->ir_type != PIPE_SHADER_IR_NIR)
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index d3f4c93de9c..cac3045282f 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -240,7 +240,7 @@ static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe
       si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_COMPUTE);
    sel->sampler_and_images_descriptors_index =
       si_sampler_and_image_descriptors_idx(PIPE_SHADER_COMPUTE);
-   sel->info.base.shared_size = cso->req_local_mem;
+   sel->info.base.shared_size = cso->static_shared_mem;
    program->shader.selector = &program->sel;
    program->shader.wave_size = si_determine_wave_size(sscreen, &program->shader);
    program->ir_type = cso->ir_type;
@@ -505,43 +505,61 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_s
 
 static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program,
                                      struct si_shader *shader, const amd_kernel_code_t *code_object,
-                                     unsigned offset, bool *prefetch)
+                                     unsigned offset, bool *prefetch, unsigned variable_shared_size)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct ac_shader_config inline_config = {0};
-   struct ac_shader_config *config;
+   const struct ac_shader_config *config;
+   unsigned rsrc2;
    uint64_t shader_va;
+   unsigned stage = shader->selector->info.base.stage;
 
    *prefetch = false;
 
-   if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset)
+   assert(variable_shared_size == 0 || stage == MESA_SHADER_KERNEL || program->ir_type == PIPE_SHADER_IR_NATIVE);
+   if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset &&
+       sctx->cs_shader_state.variable_shared_size == variable_shared_size)
       return true;
 
    if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
       config = &shader->config;
    } else {
+      code_object_to_config(code_object, &inline_config);
+      config = &inline_config;
+   }
+   /* copy rsrc2 so we don't have to change it inside the si_shader object */
+   rsrc2 = config->rsrc2;
+
+   /* only do this for OpenCL */
+   if (program->ir_type == PIPE_SHADER_IR_NATIVE || stage == MESA_SHADER_KERNEL) {
+      unsigned shared_size = program->sel.info.base.shared_size + variable_shared_size;
       unsigned lds_blocks;
 
-      config = &inline_config;
-      code_object_to_config(code_object, config);
+      /* Clover uses the compute API differently than other frontends and expects drivers to parse
+       * the shared_size out of the shader headers.
+       */
+      if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
+         lds_blocks = config->lds_size;
+      } else {
+         lds_blocks = 0;
+      }
 
-      lds_blocks = config->lds_size;
       /* XXX: We are over allocating LDS.  For GFX6, the shader reports
        * LDS in blocks of 256 bytes, so if there are 4 bytes lds
        * allocated in the shader and 4 bytes allocated by the state
        * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
        */
       if (sctx->gfx_level <= GFX6) {
-         lds_blocks += align(program->sel.info.base.shared_size, 256) >> 8;
+         lds_blocks += align(shared_size, 256) >> 8;
       } else {
-         lds_blocks += align(program->sel.info.base.shared_size, 512) >> 9;
+         lds_blocks += align(shared_size, 512) >> 9;
       }
 
       /* TODO: use si_multiwave_lds_size_workaround */
       assert(lds_blocks <= 0xFF);
 
-      config->rsrc2 &= C_00B84C_LDS_SIZE;
-      config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
+      rsrc2 &= C_00B84C_LDS_SIZE;
+      rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
    }
 
    unsigned tmpring_size;
@@ -584,7 +602,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
    }
 
    radeon_emit(config->rsrc1);
-   radeon_emit(config->rsrc2);
+   radeon_emit(rsrc2);
 
    COMPUTE_DBG(sctx->screen,
                "COMPUTE_PGM_RSRC1: 0x%08x "
@@ -596,6 +614,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
 
    sctx->cs_shader_state.emitted_program = program;
    sctx->cs_shader_state.offset = offset;
+   sctx->cs_shader_state.variable_shared_size = variable_shared_size;
 
    *prefetch = true;
    return true;
@@ -682,7 +701,8 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_
       dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
       dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
 
-      dispatch.group_segment_size = util_cpu_to_le32(program->sel.info.base.shared_size);
+      dispatch.group_segment_size =
+         util_cpu_to_le32(program->sel.info.base.shared_size + info->variable_shared_mem);
 
       dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
 
@@ -1001,7 +1021,8 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
 
    /* First emit registers. */
    bool prefetch;
-   if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch))
+   if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch,
+                                 info->variable_shared_mem))
       return;
 
    si_upload_compute_shader_descriptors(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 95e252187b4..0dcddfa7e39 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -739,6 +739,7 @@ struct si_cs_shader_state {
    struct si_compute *program;
    struct si_compute *emitted_program;
    unsigned offset;
+   uint32_t variable_shared_size;
    bool initialized;
 };
 
diff --git a/src/gallium/drivers/softpipe/sp_compute.c b/src/gallium/drivers/softpipe/sp_compute.c
index 221ef7ec797..2b70fb9777b 100644
--- a/src/gallium/drivers/softpipe/sp_compute.c
+++ b/src/gallium/drivers/softpipe/sp_compute.c
@@ -184,8 +184,9 @@ softpipe_launch_grid(struct pipe_context *context,
 
    fill_grid_size(context, info, grid_size);
 
-   if (cs->shader.req_local_mem) {
-      local_mem = CALLOC(1, cs->shader.req_local_mem);
+   uint32_t shared_mem_size = cs->shader.static_shared_mem + info->variable_shared_mem;
+   if (shared_mem_size) {
+      local_mem = CALLOC(1, shared_mem_size);
    }
 
    machines = CALLOC(sizeof(struct tgsi_exec_machine *), num_threads_in_group);
@@ -202,7 +203,7 @@ softpipe_launch_grid(struct pipe_context *context,
             machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE);
 
             machines[idx]->LocalMem = local_mem;
-            machines[idx]->LocalMemSize = cs->shader.req_local_mem;
+            machines[idx]->LocalMemSize = shared_mem_size;
             machines[idx]->NonHelperMask = (1 << (MIN2(TGSI_QUAD_SIZE, bwidth - local_x))) - 1;
             cs_prepare(cs, machines[idx],
                        local_x, local_y, local_z,
diff --git a/src/gallium/drivers/svga/svga_pipe_cs.c b/src/gallium/drivers/svga/svga_pipe_cs.c
index 7b2293863b7..428afd77c81 100644
--- a/src/gallium/drivers/svga/svga_pipe_cs.c
+++ b/src/gallium/drivers/svga/svga_pipe_cs.c
@@ -70,7 +70,7 @@ svga_create_compute_state(struct pipe_context *pipe,
    /* Collect shader basic info */
    svga_tgsi_scan_shader(&cs->base);
 
-   cs->shared_mem_size = templ->req_local_mem;
+   cs->shared_mem_size = templ->static_shared_mem;
 
    SVGA_STATS_TIME_POP(svga_sws(svga));
    return cs;
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index dbd572da0ed..c1ea961092d 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -1440,7 +1440,7 @@ static void *virgl_create_compute_state(struct pipe_context *ctx,
    handle = virgl_object_assign_handle();
    ret = virgl_encode_shader_state(vctx, handle, PIPE_SHADER_COMPUTE,
                                    &so_info,
-                                   state->req_local_mem,
+                                   state->static_shared_mem,
                                    new_tokens);
    if (ret) {
       FREE((void *)ntt_tokens);
diff --git a/src/gallium/frontends/clover/core/kernel.cpp b/src/gallium/frontends/clover/core/kernel.cpp
index b07907e471b..f1ff5a49b1a 100644
--- a/src/gallium/frontends/clover/core/kernel.cpp
+++ b/src/gallium/frontends/clover/core/kernel.cpp
@@ -100,6 +100,7 @@ kernel::launch(command_queue &q,
    copy(pad_vector(q, reduced_grid_size, 1), info.grid);
    info.pc = find(name_equals(_name), b.syms).offset;
    info.input = exec.input.data();
+   info.variable_shared_mem = exec.mem_local;
 
    q.pipe->launch_grid(q.pipe, &info);
 
@@ -274,14 +275,14 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
 
    // Create a new compute state if anything changed.
    if (!st || q != _q ||
-       cs.req_local_mem != mem_local ||
        cs.req_input_mem != input.size()) {
       if (st)
          _q->pipe->delete_compute_state(_q->pipe, st);
 
       cs.ir_type = q->device().ir_format();
       cs.prog = &(msec.data[0]);
-      cs.req_local_mem = mem_local;
+      // we only pass in NIRs or LLVMs and both IRs decode the size
+      cs.static_shared_mem = 0;
       cs.req_input_mem = input.size();
       st = q->pipe->create_compute_state(q->pipe, &cs);
       if (!st) {
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index 87d9da57764..2efd36c9c45 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -601,7 +601,7 @@ lvp_pipeline_compile_stage(struct lvp_pipeline *pipeline, nir_shader *nir)
       struct pipe_compute_state shstate = {0};
       shstate.prog = nir;
       shstate.ir_type = PIPE_SHADER_IR_NIR;
-      shstate.req_local_mem = nir->info.shared_size;
+      shstate.static_shared_mem = nir->info.shared_size;
       return device->queue.ctx->create_compute_state(device->queue.ctx, &shstate);
    } else {
       struct pipe_shader_state shstate = {0};
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 31ce0a8b1b4..1b5cf1be5ab 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -811,7 +811,9 @@ impl Kernel {
         let offsets = create_kernel_arr::<u64>(offsets, 0);
         let mut input: Vec<u8> = Vec::new();
         let mut resource_info = Vec::new();
-        let mut local_size: u64 = nir.shared_size() as u64;
+        // Set it once so we get the alignment padding right
+        let static_local_size: u64 = nir.shared_size() as u64;
+        let mut variable_local_size: u64 = static_local_size;
         let printf_size = q.device.printf_buffer_size() as u32;
         let mut samplers = Vec::new();
         let mut iviews = Vec::new();
@@ -876,13 +878,14 @@ impl Kernel {
                 KernelArgValue::LocalMem(size) => {
                     // TODO 32 bit
                     let pot = cmp::min(*size, 0x80);
-                    local_size = align(local_size, pot.next_power_of_two() as u64);
+                    variable_local_size =
+                        align(variable_local_size, pot.next_power_of_two() as u64);
                     if q.device.address_bits() == 64 {
-                        input.extend_from_slice(&local_size.to_ne_bytes());
+                        input.extend_from_slice(&variable_local_size.to_ne_bytes());
                     } else {
-                        input.extend_from_slice(&(local_size as u32).to_ne_bytes());
+                        input.extend_from_slice(&(variable_local_size as u32).to_ne_bytes());
                     }
-                    local_size += *size as u64;
+                    variable_local_size += *size as u64;
                 }
                 KernelArgValue::Sampler(sampler) => {
                     samplers.push(sampler.pipe());
@@ -897,6 +900,9 @@ impl Kernel {
             }
         }
 
+        // subtract the shader local_size as we only request something on top of that.
+        variable_local_size -= nir.shared_size() as u64;
+
         let mut printf_buf = None;
         for arg in &self.internal_args {
             if arg.offset > input.len() {
@@ -992,7 +998,7 @@ impl Kernel {
                     init_data.len() as u32,
                 );
             }
-            let cso = ctx.create_compute_state(nir, local_size as u32);
+            let cso = ctx.create_compute_state(nir, static_local_size as u32);
 
             ctx.bind_compute_state(cso);
             ctx.bind_sampler_states(&samplers);
@@ -1001,7 +1007,7 @@ impl Kernel {
             ctx.set_global_binding(resources.as_slice(), &mut globals);
             ctx.set_constant_buffer(0, &input);
 
-            ctx.launch_grid(work_dim, block, grid);
+            ctx.launch_grid(work_dim, block, grid, variable_local_size as u32);
 
             ctx.clear_global_binding(globals.len() as u32);
             ctx.clear_shader_images(iviews.len() as u32);
diff --git a/src/gallium/frontends/rusticl/mesa/pipe/context.rs b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
index bda36c41e8c..c1275554c59 100644
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@@ -265,12 +265,12 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().texture_unmap.unwrap()(self.pipe.as_ptr(), tx) };
     }
 
-    pub fn create_compute_state(&self, nir: &NirShader, local_mem: u32) -> *mut c_void {
+    pub fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void {
         let state = pipe_compute_state {
             ir_type: pipe_shader_ir::PIPE_SHADER_IR_NIR,
             prog: nir.dup_for_driver().cast(),
             req_input_mem: 0,
-            req_local_mem: local_mem,
+            static_shared_mem: static_local_mem,
         };
         unsafe { self.pipe.as_ref().create_compute_state.unwrap()(self.pipe.as_ptr(), &state) }
     }
@@ -334,10 +334,17 @@ impl PipeContext {
         }
     }
 
-    pub fn launch_grid(&self, work_dim: u32, block: [u32; 3], grid: [u32; 3]) {
+    pub fn launch_grid(
+        &self,
+        work_dim: u32,
+        block: [u32; 3],
+        grid: [u32; 3],
+        variable_local_mem: u32,
+    ) {
         let info = pipe_grid_info {
             pc: 0,
             input: ptr::null(),
+            variable_shared_mem: variable_local_mem,
             work_dim: work_dim,
             block: block,
             last_block: [0; 3],
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 34154cef08e..4fe3db90794 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -936,6 +936,13 @@ struct pipe_grid_info
     */
    const void *input;
 
+   /**
+    * Variable shared memory used by this invocation.
+    *
+    * This comes on top of shader declared shared memory.
+    */
+   uint32_t variable_shared_mem;
+
    /**
     * Grid number of dimensions, 1-3, e.g. the work_dim parameter passed to
     * clEnqueueNDRangeKernel. Note block[] and grid[] must be padded with
@@ -1005,7 +1012,7 @@ struct pipe_compute_state
 {
    enum pipe_shader_ir ir_type; /**< IR type contained in prog. */
    const void *prog; /**< Compute program to be executed. */
-   unsigned req_local_mem; /**< Required size of the LOCAL resource. */
+   unsigned static_shared_mem; /**< equal to info.shared_size, used for shaders passed as TGSI */
    unsigned req_input_mem; /**< Required size of the INPUT resource. */
 };
 
diff --git a/src/mesa/state_tracker/st_pbo_compute.c b/src/mesa/state_tracker/st_pbo_compute.c
index 45a06d9c192..fc8bc3785fc 100644
--- a/src/mesa/state_tracker/st_pbo_compute.c
+++ b/src/mesa/state_tracker/st_pbo_compute.c
@@ -942,7 +942,7 @@ download_texture_compute(struct st_context *st,
             assert(async->nir && !async->cs);
             struct pipe_compute_state state = {0};
             state.ir_type = PIPE_SHADER_IR_NIR;
-            state.req_local_mem = async->nir->info.shared_size;
+            state.static_shared_mem = async->nir->info.shared_size;
             state.prog = async->nir;
             async->nir = NULL;
             async->cs = pipe->create_compute_state(pipe, &state);
@@ -957,7 +957,7 @@ download_texture_compute(struct st_context *st,
                if (!spec->cs) {
                   struct pipe_compute_state state = {0};
                   state.ir_type = PIPE_SHADER_IR_NIR;
-                  state.req_local_mem = spec->nir->info.shared_size;
+                  state.static_shared_mem = spec->nir->info.shared_size;
                   state.prog = spec->nir;
                   spec->nir = NULL;
                   spec->cs = pipe->create_compute_state(pipe, &state);
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 376a5f6621e..ff784a44803 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -549,7 +549,7 @@ st_create_nir_shader(struct st_context *st, struct pipe_shader_state *state)
    case MESA_SHADER_COMPUTE: {
       struct pipe_compute_state cs = {0};
       cs.ir_type = state->type;
-      cs.req_local_mem = info.shared_size;
+      cs.static_shared_mem = info.shared_size;
 
       if (state->type == PIPE_SHADER_IR_NIR)
          cs.prog = state->ir.nir;