diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index be3a6a542e8..13da17fa264 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -298,6 +298,7 @@ typedef struct shader_info {
 
       struct {
          uint16_t local_size[3];
+         uint16_t max_variable_local_size;
 
          bool local_size_variable:1;
          uint8_t user_data_components_amd:3;
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 08999e95071..2e34b16dd44 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -615,6 +615,9 @@ enum brw_param_builtin {
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
    BRW_PARAM_BUILTIN_SUBGROUP_ID,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
 };
 
 #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
@@ -901,11 +904,13 @@ struct brw_cs_prog_data {
    struct brw_stage_prog_data base;
 
    unsigned local_size[3];
+   unsigned max_variable_local_size;
    unsigned simd_size;
    unsigned threads;
    unsigned slm_size;
    bool uses_barrier;
    bool uses_num_work_groups;
+   bool uses_variable_group_size;
 
    struct {
       struct brw_push_const_block cross_thread;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 96fdb6b0992..323fdb56ff5 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1190,6 +1190,8 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
+   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+      this->group_size[i] = v->group_size[i];
 }
 
 void
@@ -8866,9 +8868,16 @@ static void
 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
 {
    cs_prog_data->simd_size = size;
-   unsigned group_size = cs_prog_data->local_size[0] *
-      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
-   cs_prog_data->threads = (group_size + size - 1) / size;
+
+   unsigned group_size;
+   if (cs_prog_data->uses_variable_group_size) {
+      group_size = cs_prog_data->max_variable_local_size;
+   } else {
+      group_size = cs_prog_data->local_size[0] *
+                   cs_prog_data->local_size[1] *
+                   cs_prog_data->local_size[2];
+   }
+   cs_prog_data->threads = DIV_ROUND_UP(group_size, size);
 }
 
 static nir_shader *
@@ -8903,13 +8912,20 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    prog_data->base.total_shared = src_shader->info.cs.shared_size;
-   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
-   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
-   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
    prog_data->slm_size = src_shader->num_shared;
-   unsigned local_workgroup_size =
-      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
-      src_shader->info.cs.local_size[2];
+
+   unsigned local_workgroup_size;
+   if (prog_data->uses_variable_group_size) {
+      prog_data->max_variable_local_size =
+         src_shader->info.cs.max_variable_local_size;
+      local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+   } else {
+      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+      local_workgroup_size = src_shader->info.cs.local_size[0] *
+         src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+   }
 
    /* Limit max_threads to 64 for the GPGPU_WALKER command */
    const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index c09c4eb8759..f2612968f25 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -370,6 +370,7 @@ public:
    int *push_constant_loc;
 
    fs_reg subgroup_id;
+   fs_reg group_size[3];
    fs_reg scratch_base;
    fs_reg frag_depth;
    fs_reg frag_stencil;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f1d17a322e9..a038db72daa 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -101,11 +101,23 @@ fs_visitor::nir_setup_uniforms()
    uniforms = nir->num_uniforms / 4;
 
    if (stage == MESA_SHADER_COMPUTE) {
-      /* Add a uniform for the thread local id. It must be the last uniform
-       * on the list.
-       */
+      /* Add uniforms for builtins after regular NIR uniforms. */
       assert(uniforms == prog_data->nr_params);
-      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+      uint32_t *param;
+      if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+         param = brw_stage_prog_data_add_params(prog_data, 3);
+         for (unsigned i = 0; i < 3; i++) {
+            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+         }
+      }
+
+      /* Subgroup ID must be the last uniform on the list. This will make it
+       * easier later to split between cross thread and per thread
+       * uniforms.
+       */
+      param = brw_stage_prog_data_add_params(prog_data, 1);
       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
    }
@@ -3814,7 +3826,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
        * invocations are already executed lock-step. Instead of an actual
        * barrier just emit a scheduling fence, that will generate no code.
        */
-      if (workgroup_size() <= dispatch_width) {
+      if (!cs_prog_data->uses_variable_group_size &&
+          workgroup_size() <= dispatch_width) {
          bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
          break;
       }
@@ -3949,6 +3962,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       break;
    }
+   case nir_intrinsic_load_local_group_size: {
+      for (unsigned i = 0; i < 3; i++) {
+         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+                 group_size[i]);
+      }
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
    }
@@ -4337,7 +4358,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        *
        * TODO: Check if applies for many HW threads sharing same Data Port.
        */
-      if (slm_fence && workgroup_size() <= dispatch_width)
+      if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+          slm_fence && workgroup_size() <= dispatch_width)
          slm_fence = false;
 
       /* Prior to Gen11, there's only L3 fence, so emit that instead. */
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 434ad005281..2393011312c 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -72,8 +72,16 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          nir_ssa_def *channel = nir_load_subgroup_invocation(b);
          nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
 
-         nir_ssa_def *size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
-         nir_ssa_def *size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         nir_ssa_def *size_x;
+         nir_ssa_def *size_y;
+         if (state->nir->info.cs.local_size_variable) {
+            nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+            size_x = nir_channel(b, size_xyz, 0);
+            size_y = nir_channel(b, size_xyz, 1);
+         } else {
+            size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
+            size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         }
 
          /* The local invocation index and ID must respect the following
          *
@@ -152,12 +160,26 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
         break;
 
      case nir_intrinsic_load_num_subgroups: {
-        unsigned local_workgroup_size =
-           nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
-           nir->info.cs.local_size[2];
-        unsigned num_subgroups =
-           DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
-        sysval = nir_imm_int(b, num_subgroups);
+        if (state->nir->info.cs.local_size_variable) {
+           nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+           nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
+           nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
+           nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
+           nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
+
+           /* Calculate the equivalent of DIV_ROUND_UP. */
+           sysval = nir_idiv(b,
+                             nir_iadd_imm(b,
+                                nir_iadd_imm(b, size, state->dispatch_width), -1),
+                             nir_imm_int(b, state->dispatch_width));
+        } else {
+           unsigned local_workgroup_size =
+              nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+              nir->info.cs.local_size[2];
+           unsigned num_subgroups =
+              DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+           sysval = nir_imm_int(b, num_subgroups);
+        }
         break;
      }
 
@@ -198,16 +220,21 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir,
       .dispatch_width = dispatch_width,
    };
 
-   assert(!nir->info.cs.local_size_variable);
-   state.local_workgroup_size = nir->info.cs.local_size[0] *
-                                nir->info.cs.local_size[1] *
-                                nir->info.cs.local_size[2];
+   if (!nir->info.cs.local_size_variable) {
+      state.local_workgroup_size = nir->info.cs.local_size[0] *
+                                   nir->info.cs.local_size[1] *
+                                   nir->info.cs.local_size[2];
+   } else {
+      state.local_workgroup_size = nir->info.cs.max_variable_local_size;
+   }
 
    /* Constraints from NV_compute_shader_derivatives. */
-   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+       !nir->info.cs.local_size_variable) {
       assert(nir->info.cs.local_size[0] % 2 == 0);
       assert(nir->info.cs.local_size[1] % 2 == 0);
-   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
+   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR &&
+              !nir->info.cs.local_size_variable) {
      assert(state.local_workgroup_size % 4 == 0);
    }
 
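
Note on the thread-count arithmetic: with a variable group size, cs_set_simd_size() sizes the thread count from max_variable_local_size, and brw_nir_lower_cs_intrinsics.c rebuilds the same round-up division in NIR at shader run time, since the real group size is only known at dispatch. The following is a minimal standalone sketch, not the Mesa code itself; cs_prog_data_sketch, set_simd_size() and the local DIV_ROUND_UP macro are simplified stand-ins for brw_cs_prog_data and Mesa's macro, shown only to illustrate that DIV_ROUND_UP(group_size, size) equals the (group_size + size - 1) / size expression the patch replaces.

/* Standalone sketch; struct and macro are stand-ins, not Mesa definitions. */
#include <stdbool.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct cs_prog_data_sketch {
   unsigned local_size[3];           /* fixed local size (unused if variable) */
   unsigned max_variable_local_size; /* upper bound when the size is variable */
   bool uses_variable_group_size;
   unsigned simd_size;
   unsigned threads;
};

static void
set_simd_size(struct cs_prog_data_sketch *d, unsigned size)
{
   d->simd_size = size;

   /* With a variable group size the actual extent is unknown at compile
    * time, so the thread count is sized for the declared maximum.
    */
   unsigned group_size;
   if (d->uses_variable_group_size)
      group_size = d->max_variable_local_size;
   else
      group_size = d->local_size[0] * d->local_size[1] * d->local_size[2];

   /* DIV_ROUND_UP(group_size, size) == (group_size + size - 1) / size */
   d->threads = DIV_ROUND_UP(group_size, size);
}

int
main(void)
{
   struct cs_prog_data_sketch fixed = {
      .local_size = { 8, 8, 1 },
      .uses_variable_group_size = false,
   };
   set_simd_size(&fixed, 16);
   printf("fixed 8x8x1 at SIMD16 -> %u threads\n", fixed.threads); /* 4 */

   struct cs_prog_data_sketch variable = {
      .max_variable_local_size = 1024,
      .uses_variable_group_size = true,
   };
   set_simd_size(&variable, 32);
   printf("variable max 1024 at SIMD32 -> %u threads\n", variable.threads); /* 32 */

   return 0;
}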