pvr, pco: add base compute support

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36412>
This commit is contained in:
Simon Perretta 2025-01-01 22:11:21 +00:00 committed by Marge Bot
parent ad2b623744
commit 4f79bc2e30
12 changed files with 778 additions and 421 deletions

View file

@ -16,6 +16,7 @@ libpowervr_compiler_files = files(
'pco_ir.c',
'pco_legalize.c',
'pco_nir.c',
'pco_nir_compute.c',
'pco_nir_pvfio.c',
'pco_nir_vk.c',
'pco_opt.c',

View file

@ -71,7 +71,7 @@ typedef struct _pco_fs_data {
/** PCO compute shader-specific data. */
typedef struct _pco_cs_data {
/**/
unsigned workgroup_size[3]; /** Workgroup size. */
} pco_cs_data;
/** PCO descriptor binding data. */

View file

@ -1531,6 +1531,7 @@ bool pco_end(pco_shader *shader);
bool pco_group_instrs(pco_shader *shader);
bool pco_index(pco_shader *shader, bool skip_ssa);
bool pco_legalize(pco_shader *shader);
bool pco_nir_compute_instance_check(nir_shader *shader);
bool pco_nir_lower_algebraic(nir_shader *shader);
bool pco_nir_lower_algebraic_late(nir_shader *shader);
bool pco_nir_lower_vk(nir_shader *shader, pco_common_data *common);

View file

@ -81,6 +81,9 @@ const nir_shader_compiler_options *pco_nir_options(void)
*/
void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir)
{
if (nir->info.stage == MESA_SHADER_COMPUTE)
NIR_PASS(_, nir, pco_nir_compute_instance_check);
if (nir->info.internal)
NIR_PASS(_, nir, nir_lower_returns);
@ -108,6 +111,15 @@ void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir)
NIR_PASS(_, nir, nir_lower_system_values);
if (nir->info.stage == MESA_SHADER_COMPUTE) {
NIR_PASS(_,
nir,
nir_lower_compute_system_values,
&(nir_lower_compute_system_values_options){
.lower_cs_local_id_to_index = true,
});
}
NIR_PASS(_,
nir,
nir_lower_io_vars_to_temporaries,
@ -396,6 +408,18 @@ static void gather_fs_data(nir_shader *nir, pco_data *data)
}
}
/**
 * \brief Gathers compute shader data.
 *
 * Copies the compile-time workgroup dimensions from the NIR shader info
 * into the compute-specific shader data.
 *
 * \param[in] nir NIR shader.
 * \param[in,out] data Shader data.
 */
static void gather_cs_data(nir_shader *nir, pco_data *data)
{
   data->cs.workgroup_size[0] = nir->info.workgroup_size[0];
   data->cs.workgroup_size[1] = nir->info.workgroup_size[1];
   data->cs.workgroup_size[2] = nir->info.workgroup_size[2];
}
/**
* \brief Checks whether a NIR intrinsic op is atomic.
*
@ -462,12 +486,16 @@ static void gather_data(nir_shader *nir, pco_data *data)
return gather_fs_data(nir, data);
case MESA_SHADER_VERTEX:
/* TODO */
break;
return;
case MESA_SHADER_COMPUTE:
return gather_cs_data(nir, data);
default:
UNREACHABLE("");
break;
}
UNREACHABLE("");
}
/**

View file

@ -0,0 +1,98 @@
/*
* Copyright © 2025 Imagination Technologies Ltd.
*
* SPDX-License-Identifier: MIT
*/
/**
* \file pco_nir_compute.c
*
* \brief PCO NIR compute-specific passes.
*/
#include "nir.h"
#include "nir_builder.h"
#include "pco.h"
#include "pco_builder.h"
#include "pco_internal.h"
#include "util/macros.h"
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#define INST_CHK_FUNC "@pco_inst_chk"
/**
 * \brief Inserts the instance check.
 *
 * Creates a new entrypoint (INST_CHK_FUNC) that calls the original
 * entrypoint only when the flat local invocation index is within the
 * flattened workgroup size; out-of-range instances simply return.
 *
 * \param[in,out] shader NIR shader.
 */
static void insert_instance_check(nir_shader *shader)
{
/* Get original entrypoint. */
nir_function *orig_entrypoint = nir_shader_get_entrypoint(shader)->function;
/* Create a function for the instance check which will serve as the new
* entrypoint.
*/
nir_function *inst_chk_func = nir_function_create(shader, INST_CHK_FUNC);
inst_chk_func->is_entrypoint = true;
orig_entrypoint->is_entrypoint = false;
nir_builder b = nir_builder_create(nir_function_impl_create(inst_chk_func));
b.cursor = nir_after_cf_list(&b.impl->body);
/* If the current instance index is not less than the total (flattened)
 * workgroup size, we don't execute.
 */
nir_def *local_size = nir_load_workgroup_size(&b);
nir_def *size_x = nir_channel(&b, local_size, 0);
nir_def *size_y = nir_channel(&b, local_size, 1);
nir_def *size_z = nir_channel(&b, local_size, 2);
/* Flattened workgroup size: x * y * z. */
nir_def *flat_size = nir_imul(&b, nir_imul(&b, size_x, size_y), size_z);
nir_def *flat_id = nir_load_local_invocation_index(&b);
/* NOTE(review): nir_ilt is a signed compare — assumes the flat id/size fit
 * in the non-negative signed 32-bit range; confirm against max workgroup
 * size limits.
 */
nir_def *cond_inst_valid = nir_ilt(&b, flat_id, flat_size);
nir_if *nif = nir_push_if(&b, cond_inst_valid);
{
/* Instance is in range: run the real shader body. */
nir_call(&b, orig_entrypoint);
}
nir_pop_if(&b, nif);
nir_jump(&b, nir_jump_return);
}
/**
 * \brief Inserts an instance check for compute shaders.
 *
 * Wraps the shader entrypoint so that instances outside the flattened
 * workgroup size do not execute, then re-inlines so the shader is left
 * with a single entrypoint again.
 *
 * \param[in,out] shader NIR shader.
 * \return True if the pass made progress.
 */
bool pco_nir_compute_instance_check(nir_shader *shader)
{
assert(shader->info.stage == MESA_SHADER_COMPUTE);
/* Internal (driver-generated) shaders are skipped — presumably they are
 * trusted not to need the bounds check; confirm with pass callers.
 */
if (shader->info.internal)
return false;
/* Check we haven't already done this. */
nir_foreach_function (function, shader) {
if (function->name && !strcmp(function->name, INST_CHK_FUNC))
return false;
}
insert_instance_check(shader);
/* Re-inline: lower returns and inline the original entrypoint into the
 * check function, then clean up so only the new entrypoint remains.
 */
NIR_PASS(_, shader, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS(_, shader, nir_lower_returns);
NIR_PASS(_, shader, nir_inline_functions);
NIR_PASS(_, shader, nir_copy_prop);
NIR_PASS(_, shader, nir_opt_deref);
nir_remove_non_entrypoints(shader);
NIR_PASS(_, shader, nir_lower_variable_initializers, ~0);
return true;
}

View file

@ -630,8 +630,36 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx,
UNREACHABLE("");
}
/**
 * \brief Maps a system value to the register class it is loaded from.
 *
 * \param[in] sys_val The system value being loaded.
 * \param[in] stage The shader stage.
 * \return The hardware register class holding the system value.
 */
static inline enum pco_reg_class sys_val_to_reg_class(gl_system_value sys_val,
                                                      mesa_shader_stage stage)
{
   /* Vertex shader system values all come in via vertex-input registers. */
   if (stage == MESA_SHADER_VERTEX)
      return PCO_REG_CLASS_VTXIN;

   if (stage == MESA_SHADER_COMPUTE) {
      if (sys_val == SYSTEM_VALUE_LOCAL_INVOCATION_INDEX)
         return PCO_REG_CLASS_VTXIN;

      /* Workgroup id / count are DOUTW'd into coefficient registers. */
      if (sys_val == SYSTEM_VALUE_WORKGROUP_ID ||
          sys_val == SYSTEM_VALUE_NUM_WORKGROUPS)
         return PCO_REG_CLASS_COEFF;
   }

   UNREACHABLE("");
}
/**
* \brief Translates a NIR vs load system value intrinsic into PCO.
* \brief Translates a NIR load system value intrinsic into PCO.
*
* \param[in,out] tctx Translation context.
* \param[in] intr System value intrinsic.
@ -639,7 +667,7 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx,
* \return The translated PCO instruction.
*/
static pco_instr *
trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
trans_load_sysval(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
{
gl_system_value sys_val = nir_system_value_from_intrinsic(intr->intrinsic);
const pco_range *range = &tctx->shader->data.common.sys_vals[sys_val];
@ -647,7 +675,9 @@ trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
unsigned chans = pco_ref_get_chans(dest);
assert(chans == range->count);
pco_ref src = pco_ref_hwreg_vec(range->start, PCO_REG_CLASS_VTXIN, chans);
pco_ref src = pco_ref_hwreg_vec(range->start,
sys_val_to_reg_class(sys_val, tctx->stage),
chans);
return pco_mov(&tctx->b, dest, src, .rpt = chans);
}
@ -702,12 +732,18 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr)
instr = trans_atomic_buffer(tctx, intr, dest, src[1], src[2]);
break;
/* Vertex sysvals. */
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_draw_id:
instr = trans_load_sysval_vs(tctx, intr, dest);
/* Compute sysvals. */
case nir_intrinsic_load_local_invocation_index:
case nir_intrinsic_load_workgroup_id:
case nir_intrinsic_load_num_workgroups:
instr = trans_load_sysval(tctx, intr, dest);
break;
case nir_intrinsic_ddx:

View file

@ -57,7 +57,7 @@
*/
#define PVR_PDS_CDM_WORK_GROUP_ID_X 0
#define PVR_PDS_CDM_WORK_GROUP_ID_Y 1
#define PVR_PDS_CDM_WORK_GROUP_ID_Z 2
#define PVR_PDS_CDM_WORK_GROUP_ID_Z 3
/* Local IDs are available in every task. */
#define PVR_PDS_CDM_LOCAL_ID_X 0
#define PVR_PDS_CDM_LOCAL_ID_YZ 1
@ -91,6 +91,12 @@ static const uint32_t cache_control_const[2][2] = {
{ 0, 0 }
};
static const uint32_t wg_id_temps[3] = {
PVR_PDS_CDM_WORK_GROUP_ID_X,
PVR_PDS_CDM_WORK_GROUP_ID_Y,
PVR_PDS_CDM_WORK_GROUP_ID_Z,
};
/*****************************************************************************
Function definitions
*****************************************************************************/
@ -1768,6 +1774,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
* DOUTW for local, and two for global.
*/
uint32_t work_group_id_ctrl_words[2] = { 0 };
uint32_t num_work_groups_ctrl_words[2] = { 0 };
uint32_t local_id_ctrl_word = 0;
uint32_t local_input_register;
@ -1795,6 +1802,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
const bool has_local_input_regs =
(program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_local_input_reg[3] = {
[0] = program->local_input_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->local_input_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->local_input_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_work_group_input_regs =
(program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_work_group_input_reg[3] = {
[0] = program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_num_work_groups_regs =
(program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_num_work_groups_reg[3] = {
[0] = program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_barrier_coefficient = program->barrier_coefficient !=
PVR_PDS_REG_UNUSED;
if (program->kick_usc) {
/* Copy the USC task control words to constants. */
usc_control_constant64 =
@ -1822,13 +1865,12 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->cond_render_pred_temp = cond_render_pred_temp;
}
if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) ||
(program->clear_pds_barrier) ||
if (has_barrier_coefficient || program->clear_pds_barrier ||
(program->kick_usc && program->conditional_render)) {
zero_constant64 = pvr_pds_get_constants(&next_constant, 2, &data_size);
}
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
barrier_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
if (PVR_HAS_QUIRK(dev_info, 51210)) {
barrier_ctrl_word2 =
@ -1836,35 +1878,51 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
}
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
/* For DOUTW */
if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) {
work_group_id_ctrl_words[0] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
work_group_id_ctrl_words[1] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
/* For DOUTW */
if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) {
num_work_groups_ctrl_words[0] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (has_num_work_groups_reg[2]) {
num_work_groups_ctrl_words[1] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
/* For DOUTW */
if (has_local_input_regs) {
local_id_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (program->add_base_workgroup) {
for (uint32_t workgroup_component = 0; workgroup_component < 3;
workgroup_component++) {
if (program->work_group_input_regs[workgroup_component] !=
PVR_PDS_REG_UNUSED) {
program
->base_workgroup_constant_offset_in_dwords[workgroup_component] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
/* Patch constants. */
for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) {
if (has_work_group_input_reg[wg_comp]) {
program->base_workgroup_constant_offset_in_dwords[wg_comp] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
}
if (has_num_work_groups_regs) {
/* Ensure 64-bit alignment. */
program->num_workgroups_constant_offset_in_dwords[0] =
pvr_pds_get_constants(&next_constant, 2, &data_size);
program->num_workgroups_constant_offset_in_dwords[1] =
program->num_workgroups_constant_offset_in_dwords[0];
program->num_workgroups_constant_offset_in_dwords[2] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
if (program->kick_usc) {
/* Src0 for DOUTU */
@ -1883,15 +1941,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->usc_task_control_coeff_update.src0); /* 64-bit Src0 */
}
if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) ||
(program->clear_pds_barrier) ||
if (has_barrier_coefficient || program->clear_pds_barrier ||
(program->kick_usc && program->conditional_render)) {
pvr_pds_write_wide_constant(buffer, zero_constant64, 0); /* 64-bit
* Src0
*/
}
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
if (PVR_HAS_QUIRK(dev_info, 51210)) {
/* Write the constant for the coefficient register write. */
doutw = pvr_pds_encode_doutw_src1(
@ -1913,18 +1970,16 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* Check whether the barrier is going to be the last DOUTW done by
* the coefficient sync task.
*/
if ((program->work_group_input_regs[0] == PVR_PDS_REG_UNUSED) &&
(program->work_group_input_regs[1] == PVR_PDS_REG_UNUSED) &&
(program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)) {
if (!has_work_group_input_regs && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
}
pvr_pds_write_constant32(buffer, barrier_ctrl_word, doutw);
}
/**/
/* If we want work-group id X, see if we also want work-group id Y. */
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED &&
program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[0] && has_work_group_input_reg[1]) {
/* Make sure we are going to DOUTW them into adjacent registers
* otherwise we can't do it in one.
*/
@ -1940,14 +1995,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[0], doutw);
}
/* If we only want one of X or Y then handle them separately. */
else {
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[0],
PVR_PDS_DOUTW_LOWER32,
@ -1958,13 +2013,13 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last
* one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
work_group_id_ctrl_words[0],
doutw);
} else if (program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
} else if (has_work_group_input_reg[1]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[1],
PVR_PDS_DOUTW_UPPER32,
@ -1975,7 +2030,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last
* one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
@ -1985,35 +2040,111 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
/* Handle work-group id Z. */
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[2],
PVR_PDS_DOUTW_UPPER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
if (!has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw);
}
/**/
/* If we want num work-groups X, see if we also want num work-groups Y. */
if (has_num_work_groups_reg[0] && has_num_work_groups_reg[1]) {
/* Make sure we are going to DOUTW them into adjacent registers
* otherwise we can't do it in one.
*/
assert(program->num_work_groups_regs[1] ==
(program->num_work_groups_regs[0] + 1));
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[0],
PVR_PDS_DOUTW_LOWER64,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[0], doutw);
}
/* If we only want one of X or Y then handle them separately. */
else {
if (has_num_work_groups_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[0],
PVR_PDS_DOUTW_LOWER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last
* one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
num_work_groups_ctrl_words[0],
doutw);
} else if (has_num_work_groups_reg[1]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[1],
PVR_PDS_DOUTW_UPPER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last
* one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
num_work_groups_ctrl_words[0],
doutw);
}
}
/* Handle num work-groups Z. */
if (has_num_work_groups_reg[2]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[2],
PVR_PDS_DOUTW_LOWER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE |
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN,
true,
dev_info);
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw);
pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[1], doutw);
}
/* Handle the local IDs. */
if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_reg[1] || has_local_input_reg[2]) {
uint32_t dest_reg;
/* If we want local id Y and Z make sure the compiler wants them in
* the same register.
*/
if (!program->flattened_work_groups) {
if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) &&
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_reg[1] && has_local_input_reg[2]) {
assert(program->local_input_regs[1] ==
program->local_input_regs[2]);
}
}
if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED)
if (has_local_input_reg[1])
dest_reg = program->local_input_regs[1];
else
dest_reg = program->local_input_regs[2];
@ -2021,7 +2152,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we want local id X and (Y or Z) then we can do that in a
* single 64-bit DOUTW.
*/
if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_local_input_reg[0]) {
assert(dest_reg == (program->local_input_regs[0] + 1));
doutw = pvr_pds_encode_doutw_src1(
@ -2052,7 +2183,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
/* If we don't want Y or Z then just DMA in X in a single 32-bit DOUTW.
*/
else if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
else if (has_local_input_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->local_input_regs[0],
PVR_PDS_DOUTW_LOWER32,
@ -2090,7 +2221,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->coeff_update_task_branch_size /* ADDR */));
/* Do we need to initialize the barrier coefficient? */
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
if (PVR_HAS_QUIRK(dev_info, 51210)) {
/* Initialize the second barrier coefficient registers to zero.
*/
@ -2106,51 +2237,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
zero_constant64 >> 1)); /* SRC0 */
}
if (program->add_base_workgroup) {
const uint32_t temp_values[3] = { 0, 1, 3 };
for (uint32_t workgroup_component = 0; workgroup_component < 3;
workgroup_component++) {
if (program->work_group_input_regs[workgroup_component] ==
PVR_PDS_REG_UNUSED) {
continue;
}
/* Add base workgroup to workgroup ids. */
for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) {
if (!has_work_group_input_reg[wg_comp])
continue;
APPEND(pvr_pds_inst_encode_add32(
/* cc */ 0x0,
/* ALUM */ 0,
/* SNA */ 0,
/* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER +
program->base_workgroup_constant_offset_in_dwords
[workgroup_component],
/* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER +
PVR_PDS_CDM_WORK_GROUP_ID_X +
temp_values[workgroup_component],
/* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER +
PVR_PDS_CDM_WORK_GROUP_ID_X +
temp_values[workgroup_component]));
}
APPEND(pvr_pds_inst_encode_add32(
/* cc */ 0x0,
/* ALUM */ 0,
/* SNA */ 0,
/* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER +
program->base_workgroup_constant_offset_in_dwords[wg_comp],
/* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER +
wg_id_temps[wg_comp],
/* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER +
wg_id_temps[wg_comp]));
}
/* If we are going to put the work-group IDs in coefficients then we
* just need to do the DOUTWs.
*/
if ((program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED)) {
uint32_t dest_reg;
if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) {
uint32_t src_reg;
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED)
dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X;
if (has_work_group_input_reg[0])
src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X;
else
dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y;
src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y;
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
work_group_id_ctrl_words[0], /* SRC1
*/
dest_reg >> 1)); /* SRC0 */
src_reg >> 1)); /* SRC0 */
}
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
APPEND(pvr_pds_encode_doutw64(
0, /* cc */
0, /* END */
@ -2159,6 +2281,35 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
1)); /* SRC0 */
}
/* If we are going to put the num work-groups in coefficients then we
* just need to do the DOUTWs.
*/
if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) {
uint32_t src_reg;
if (has_num_work_groups_reg[0])
src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[0];
else
src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[1];
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
num_work_groups_ctrl_words[0], /* SRC1 */
src_reg >> 1)); /* SRC0 */
}
if (has_num_work_groups_reg[2]) {
APPEND(pvr_pds_encode_doutw64(
0, /* cc */
0, /* END */
num_work_groups_ctrl_words[1], /* SRC1 */
(PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[2]) >>
1)); /* SRC0 */
}
/* Issue the task to the USC. */
if (program->kick_usc && program->has_coefficient_update_task) {
APPEND(pvr_pds_encode_doutu(0, /* cc */
@ -2176,28 +2327,24 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* DOUTW in the local IDs. */
/* If we want X and Y or Z, we only need one DOUTW. */
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) &&
((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED))) {
if (has_local_input_reg[0] &&
(has_local_input_reg[1] || has_local_input_reg[2])) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X;
} else {
/* If we just want X. */
if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_local_input_reg[0]) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X;
}
/* If we just want Y or Z. */
else if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED ||
program->local_input_regs[2] != PVR_PDS_REG_UNUSED) {
else if (has_local_input_reg[1] || has_local_input_reg[2]) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_YZ;
}
}
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_regs) {
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
local_id_ctrl_word, /* SRC1 */

View file

@ -539,6 +539,7 @@ struct pvr_pds_compute_shader_program {
uint32_t local_input_regs[3];
uint32_t work_group_input_regs[3];
uint32_t num_work_groups_regs[3];
uint32_t global_input_regs[3];
uint32_t barrier_coefficient;
@ -553,8 +554,8 @@ struct pvr_pds_compute_shader_program {
uint32_t coeff_update_task_branch_size;
bool add_base_workgroup;
uint32_t base_workgroup_constant_offset_in_dwords[3];
uint32_t num_workgroups_constant_offset_in_dwords[3];
bool kick_usc;
@ -585,6 +586,11 @@ static inline void pvr_pds_compute_shader_program_init(
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
},
.num_work_groups_regs = {
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
},
.global_input_regs = {
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,

View file

@ -2609,26 +2609,37 @@ void pvr_CmdBindDescriptorSets2KHR(
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
struct pvr_descriptor_state *desc_state =
&cmd_buffer->state.gfx_desc_state;
struct pvr_descriptor_state *graphics_desc_state =
&cmd_buffer->state.gfx_desc_state;
struct pvr_descriptor_state *compute_desc_state =
&cmd_buffer->state.compute_desc_state;
for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount;
++u) {
VK_FROM_HANDLE(pvr_descriptor_set,
set,
pBindDescriptorSetsInfo->pDescriptorSets[u]);
unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet;
for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; ++u) {
VK_FROM_HANDLE(pvr_descriptor_set,
set,
pBindDescriptorSetsInfo->pDescriptorSets[u]);
unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet;
if (desc_state->sets[desc_set] != set) {
desc_state->sets[desc_set] = set;
desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
if (graphics_desc_state->sets[desc_set] != set) {
graphics_desc_state->sets[desc_set] = set;
graphics_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
cmd_buffer->state.dirty.gfx_desc_dirty = true;
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
if (compute_desc_state->sets[desc_set] != set) {
compute_desc_state->sets[desc_set] = set;
compute_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
}
assert(!(pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
cmd_buffer->state.dirty.gfx_desc_dirty = true;
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
cmd_buffer->state.dirty.compute_desc_dirty = true;
}
void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
@ -3600,8 +3611,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const uint32_t const_shared_regs =
pipeline->shader_state.const_shared_reg_count;
const uint32_t const_shared_regs = pipeline->cs_data.common.shareds;
struct pvr_compute_kernel_info info;
/* No shared regs, no need to use an allocation kernel. */
@ -3624,7 +3634,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
.usc_common_shared = true,
.usc_common_size =
DIV_ROUND_UP(const_shared_regs,
DIV_ROUND_UP(PVR_DW_TO_BYTES(const_shared_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
.global_size = { 1, 1, 1 },
@ -3748,8 +3758,6 @@ void pvr_compute_update_kernel_private(
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_compute_kernel_info info = {
@ -3783,15 +3791,8 @@ void pvr_compute_update_kernel_private(
uint32_t work_size = pipeline->workgroup_size.width *
pipeline->workgroup_size.height *
pipeline->workgroup_size.depth;
uint32_t coeff_regs;
if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
/* Enforce a single workgroup per cluster through allocation starvation.
*/
coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
} else {
coeff_regs = pipeline->coeff_regs_count;
}
uint32_t coeff_regs =
pipeline->coeff_regs_count + pipeline->const_shared_regs_count;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
@ -3800,8 +3801,6 @@ void pvr_compute_update_kernel_private(
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
coeff_regs += pipeline->const_shared_regs_count;
if (pipeline->const_shared_regs_count > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
@ -3818,24 +3817,53 @@ void pvr_compute_update_kernel_private(
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
/* TODO: Wire up the base_workgroup variant program when implementing
* VK_KHR_device_group. The values will also need patching into the program.
*/
static void pvr_compute_update_kernel(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
pvr_dev_addr_t indirect_addr,
const uint32_t global_base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const struct pvr_compute_shader_state *shader_state =
&pipeline->shader_state;
const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
const pco_data *const cs_data = &pipeline->cs_data;
const struct pvr_pds_info *program_info = &pipeline->pds_cs_program_info;
bool uses_wg_id = pipeline->base_workgroup_data_patching_offset != ~0u;
bool uses_num_wgs = pipeline->num_workgroups_data_patching_offset != ~0u;
bool base_group_set = !!global_base_group[0] || !!global_base_group[1] ||
!!global_base_group[2];
uint32_t pds_data_offset = pipeline->pds_cs_program.data_offset;
/* Does the PDS data segment need patching, or can the default be used? */
if ((uses_wg_id && base_group_set) || uses_num_wgs) {
struct pvr_pds_upload pds_data_upload;
uint32_t *pds_data;
/* Upload and patch PDS data segment. */
pvr_cmd_buffer_upload_pds_data(cmd_buffer,
pipeline->pds_cs_data_section,
program_info->data_size_in_dwords,
16,
&pds_data_upload);
pds_data_offset = pds_data_upload.data_offset;
pds_data = pvr_bo_suballoc_get_map_addr(pds_data_upload.pvr_bo);
if (uses_wg_id && base_group_set) {
unsigned offset = pipeline->base_workgroup_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_base_group[u];
}
}
if (uses_num_wgs) {
unsigned offset = pipeline->num_workgroups_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_workgroup_size[u];
}
}
}
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = indirect_addr,
@ -3847,13 +3875,13 @@ static void pvr_compute_update_kernel(
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.pds_data_offset = pipeline->primary_program.data_offset,
.pds_code_offset = pipeline->primary_program.code_offset,
.pds_data_offset = pds_data_offset,
.pds_code_offset = pipeline->pds_cs_program.code_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_unified_size =
DIV_ROUND_UP(shader_state->input_register_count << 2U,
DIV_ROUND_UP(cs_data->common.vtxins << 2U,
ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
/* clang-format off */
@ -3865,16 +3893,10 @@ static void pvr_compute_update_kernel(
/* clang-format on */
};
uint32_t work_size = shader_state->work_size;
uint32_t coeff_regs;
if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
/* Enforce a single workgroup per cluster through allocation starvation.
*/
coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
} else {
coeff_regs = shader_state->coefficient_register_count;
}
uint32_t work_size = cs_data->cs.workgroup_size[0] *
cs_data->cs.workgroup_size[1] *
cs_data->cs.workgroup_size[2];
uint32_t coeff_regs = cs_data->common.coeffs + cs_data->common.shareds;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
@ -3883,9 +3905,7 @@ static void pvr_compute_update_kernel(
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
coeff_regs += shader_state->const_shared_reg_count;
if (shader_state->const_shared_reg_count > 0)
if (cs_data->common.shareds > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
work_size =
@ -3947,19 +3967,21 @@ static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
static void pvr_cmd_dispatch(
struct pvr_cmd_buffer *const cmd_buffer,
const pvr_dev_addr_t indirect_addr,
const uint32_t base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const struct pvr_compute_pipeline *compute_pipeline =
state->compute_pipeline;
const pco_data *const cs_data = &compute_pipeline->cs_data;
struct pvr_sub_cmd_compute *sub_cmd;
VkResult result;
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
sub_cmd = &state->current_sub_cmd->compute;
sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
sub_cmd->uses_atomic_ops |= cs_data->common.uses.atomics;
sub_cmd->uses_barrier |= cs_data->common.uses.barriers;
if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
result = pvr_cmd_upload_push_consts(cmd_buffer);
@ -3972,16 +3994,33 @@ static void pvr_cmd_dispatch(
state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
UNREACHABLE("compute descriptor support");
if (state->dirty.compute_desc_dirty ||
state->dirty.compute_pipeline_binding) {
result = pvr_setup_descriptor_mappings(
cmd_buffer,
PVR_STAGE_ALLOCATION_COMPUTE,
&compute_pipeline->descriptor_state,
NULL,
&state->pds_compute_descriptor_data_offset);
if (result != VK_SUCCESS)
return;
}
pvr_compute_update_shared(cmd_buffer, sub_cmd);
pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
pvr_compute_update_kernel(cmd_buffer,
sub_cmd,
indirect_addr,
base_group,
workgroup_size);
}
void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
void pvr_CmdDispatchBase(VkCommandBuffer commandBuffer,
uint32_t baseGroupX,
uint32_t baseGroupY,
uint32_t baseGroupZ,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
{
PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
@ -3992,6 +4031,7 @@ void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_INVALID,
(uint32_t[]){ baseGroupX, baseGroupY, baseGroupZ },
(uint32_t[]){ groupCountX, groupCountY, groupCountZ });
}
@ -4006,6 +4046,7 @@ void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
(uint32_t[]){ 0, 0, 0 },
(uint32_t[]){ 1, 1, 1 });
}
@ -4070,7 +4111,7 @@ pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
state0.usc_common_size =
DIV_ROUND_UP(vs_data->common.shareds,
DIV_ROUND_UP(PVR_DW_TO_BYTES(vs_data->common.shareds),
ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE);
state0.pds_data_size = DIV_ROUND_UP(

View file

@ -102,14 +102,6 @@ static const struct pvr_hard_coding_data {
.shader_size = sizeof(pvr_simple_compute_shader),
.shader_info = {
.uses_atomic_ops = false,
.uses_barrier = false,
.uses_num_workgroups = false,
.const_shared_reg_count = 4,
.input_register_count = 8,
.work_size = 1 * 1 * 1,
.coefficient_register_count = 4,
},
.build_info = {

View file

@ -669,221 +669,164 @@ static void pvr_pds_descriptor_program_destroy(
static void pvr_pds_compute_program_setup(
const struct pvr_device_info *dev_info,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
bool add_base_workgroup,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
pco_data *cs_data,
struct pvr_compute_shader_state *compute_state,
struct pvr_pds_compute_shader_program *const program)
{
pco_range *sys_vals = cs_data->common.sys_vals;
pvr_pds_compute_shader_program_init(program);
program->local_input_regs[0] = local_input_regs[0];
program->local_input_regs[1] = local_input_regs[1];
program->local_input_regs[2] = local_input_regs[2];
program->work_group_input_regs[0] = work_group_input_regs[0];
program->work_group_input_regs[1] = work_group_input_regs[1];
program->work_group_input_regs[2] = work_group_input_regs[2];
program->barrier_coefficient = barrier_coefficient;
program->add_base_workgroup = add_base_workgroup;
if (sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].count > 0) {
program->local_input_regs[0] =
sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].start;
}
for (unsigned u = 0; u < ARRAY_SIZE(program->work_group_input_regs); ++u) {
if (sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > u) {
program->work_group_input_regs[u] =
sys_vals[SYSTEM_VALUE_WORKGROUP_ID].start + u;
}
}
for (unsigned u = 0; u < ARRAY_SIZE(program->num_work_groups_regs); ++u) {
if (sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > u) {
program->num_work_groups_regs[u] =
sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].start + u;
}
}
program->flattened_work_groups = true;
program->kick_usc = true;
STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
pvr_pds_setup_doutu(&program->usc_task_control,
usc_shader_dev_addr.addr,
usc_temps,
compute_state->bo->dev_addr.addr,
cs_data->common.temps,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}
/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
/* This uploads the code segment and base data segment variant.
* This can be patched at dispatch time.
*/
static VkResult pvr_pds_compute_program_create_and_upload(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_upload *const pds_upload_out,
struct pvr_pds_info *const pds_info_out)
struct pvr_compute_shader_state *compute_state,
struct pvr_compute_pipeline *compute_pipeline)
{
pco_range *sys_vals = compute_pipeline->cs_data.common.sys_vals;
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
uint32_t *code_buffer;
uint32_t *data_buffer;
VkResult result;
bool uses_wg_id = sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > 0;
bool uses_num_wgs = sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > 0;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
false,
usc_temps,
usc_shader_dev_addr,
&compute_pipeline->cs_data,
compute_state,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
code_buffer = vk_alloc2(&device->vk.alloc,
allocator,
PVR_DW_TO_BYTES(program.code_size),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!code_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* FIXME: pvr_pds_compute_shader doesn't implement
* PDS_GENERATE_CODEDATA_SEGMENTS.
*/
data_buffer = vk_alloc2(&device->vk.alloc,
allocator,
PVR_DW_TO_BYTES(program.code_size),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!data_buffer) {
vk_free2(&device->vk.alloc, allocator, code_buffer);
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
pvr_pds_compute_shader(&program,
&staging_buffer[0],
&code_buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
pvr_pds_compute_shader(&program,
&staging_buffer[program.code_size],
&data_buffer[0],
PDS_GENERATE_DATA_SEGMENT,
dev_info);
/* Initialize. */
if (uses_wg_id) {
unsigned offset = program.base_workgroup_constant_offset_in_dwords[0];
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
data_buffer[offset + u] = 0;
}
}
if (uses_num_wgs) {
unsigned offset = program.num_workgroups_constant_offset_in_dwords[0];
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
data_buffer[offset + u] = 0;
}
}
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[program.code_size],
data_buffer,
program.data_size,
16,
&staging_buffer[0],
code_buffer,
program.code_size,
16,
16,
pds_upload_out);
&compute_pipeline->pds_cs_program);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
vk_free2(&device->vk.alloc, allocator, code_buffer);
vk_free2(&device->vk.alloc, allocator, data_buffer);
return result;
}
*pds_info_out = (struct pvr_pds_info){
compute_pipeline->pds_cs_data_section = data_buffer;
/* The base workgroup and num workgroups can be patched in the
* PDS data section before dispatch so we save their offsets.
*/
compute_pipeline->base_workgroup_data_patching_offset = ~0u;
if (uses_wg_id) {
compute_pipeline->base_workgroup_data_patching_offset =
program.base_workgroup_constant_offset_in_dwords[0];
}
compute_pipeline->num_workgroups_data_patching_offset = ~0u;
if (uses_num_wgs) {
compute_pipeline->num_workgroups_data_patching_offset =
program.num_workgroups_constant_offset_in_dwords[0];
}
compute_pipeline->pds_cs_program_info = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
};
static void pvr_pds_compute_program_destroy(
struct pvr_device *const device,
const struct VkAllocationCallbacks *const allocator,
struct pvr_pds_upload *const pds_program,
struct pvr_pds_info *const pds_info)
{
/* We don't allocate an entries buffer so we don't need to free it */
pvr_bo_suballoc_free(pds_program->pvr_bo);
}
/* This only uploads the code segment. The data segment will need to be patched
* with the base workgroup before uploading.
*/
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_base_workgroup_program *program_out)
{
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t buffer_size;
uint32_t *buffer;
VkResult result;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
true,
usc_temps,
usc_shader_dev_addr,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
buffer = vk_alloc2(&device->vk.alloc,
allocator,
buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_compute_shader(&program,
&buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
NULL,
0,
0,
buffer,
program.code_size,
16,
16,
&program_out->code_upload);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, buffer);
return result;
}
pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
program_out->data_section = buffer;
/* We'll need to patch the base workgroup in the PDS data section before
* dispatch so we save the offsets at which to patch. We only need to save
* the offset for the first workgroup id since the workgroup ids are stored
* contiguously in the data segment.
*/
program_out->base_workgroup_data_patching_offset =
program.base_workgroup_constant_offset_in_dwords[0];
program_out->info = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
vk_free2(&device->vk.alloc, allocator, code_buffer);
return VK_SUCCESS;
}
static void pvr_pds_compute_base_workgroup_variant_program_finish(
struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_base_workgroup_program *const state)
static void
pvr_pds_compute_program_destroy(struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_upload *const pds_cs_program,
uint32_t *pds_cs_data_section)
{
pvr_bo_suballoc_free(state->code_upload.pvr_bo);
vk_free2(&device->vk.alloc, allocator, state->data_section);
pvr_bo_suballoc_free(pds_cs_program->pvr_bo);
vk_free2(&device->vk.alloc, allocator, pds_cs_data_section);
}
/******************************************************************************
@ -917,10 +860,28 @@ static void pvr_pipeline_finish(struct pvr_device *device,
#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
static void pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout);
static void pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout);
/******************************************************************************
Compute pipeline functions
******************************************************************************/
/* Snapshots the compiled compute shader's pco data into the pipeline.
 *
 * The pco_shader (and the data it owns) is freed along with the shader
 * build context, so the pipeline keeps its own copy of the metadata.
 */
static void
pvr_compute_state_save(struct pvr_compute_pipeline *compute_pipeline,
                       pco_shader *cs)
{
   /* Whole-struct copy; equivalent to memcpy of *pco_shader_data(cs). */
   compute_pipeline->cs_data = *pco_shader_data(cs);
}
/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
struct pvr_device *const device,
@ -930,71 +891,76 @@ static VkResult pvr_compute_pipeline_compile(
struct pvr_compute_pipeline *const compute_pipeline)
{
struct vk_pipeline_layout *layout = compute_pipeline->base.layout;
uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t barrier_coefficient;
uint32_t usc_temps;
const uint32_t cache_line_size =
rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
pco_ctx *pco_ctx = device->pdevice->pco_ctx;
void *shader_mem_ctx = ralloc_context(NULL);
pco_data shader_data = { 0 };
nir_shader *nir;
pco_shader *cs;
struct pvr_compute_shader_state *compute_state =
&compute_pipeline->shader_state;
VkResult result;
compute_pipeline->shader_state.const_shared_reg_count = 0;
result =
vk_pipeline_shader_stage_to_nir(&device->vk,
compute_pipeline->base.pipeline_flags,
&pCreateInfo->stage,
pco_spirv_options(),
pco_nir_options(),
shader_mem_ctx,
&nir);
if (result != VK_SUCCESS)
goto err_free_build_context;
/* FIXME: Compile and upload the shader. */
/* FIXME: Initialize the shader state and setup build info. */
UNREACHABLE("finishme: compute support");
pco_preprocess_nir(pco_ctx, nir);
pvr_preprocess_shader_data(&shader_data, nir, pCreateInfo, layout);
pco_lower_nir(pco_ctx, nir, &shader_data);
pco_postprocess_nir(pco_ctx, nir, &shader_data);
pvr_postprocess_shader_data(&shader_data, nir, pCreateInfo, layout);
cs = pco_trans_nir(pco_ctx, nir, &shader_data, shader_mem_ctx);
if (!cs) {
result = VK_ERROR_INITIALIZATION_FAILED;
goto err_free_build_context;
}
pco_process_ir(pco_ctx, cs);
pco_encode_ir(pco_ctx, cs);
pvr_compute_state_save(compute_pipeline, cs);
result = pvr_gpu_upload_usc(device,
pco_shader_binary_data(cs),
pco_shader_binary_size(cs),
cache_line_size,
&compute_pipeline->shader_state.bo);
if (result != VK_SUCCESS)
goto err_free_build_context;
result = pvr_pds_descriptor_program_create_and_upload(
device,
allocator,
layout,
MESA_SHADER_COMPUTE,
NULL,
&compute_pipeline->cs_data,
&compute_pipeline->descriptor_state);
if (result != VK_SUCCESS)
goto err_free_shader;
result = pvr_pds_compute_program_create_and_upload(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
result = pvr_pds_compute_program_create_and_upload(device,
allocator,
compute_state,
compute_pipeline);
if (result != VK_SUCCESS)
goto err_free_descriptor_program;
/* If the workgroup ID is required, then we require the base workgroup
* variant of the PDS compute program as well.
*/
compute_pipeline->flags.base_workgroup =
work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[1] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[2] != PVR_PDS_REG_UNUSED;
if (compute_pipeline->flags.base_workgroup) {
result = pvr_pds_compute_base_workgroup_variant_program_init(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_base_workgroup_variant_program);
if (result != VK_SUCCESS)
goto err_destroy_compute_program;
}
ralloc_free(shader_mem_ctx);
return VK_SUCCESS;
err_destroy_compute_program:
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
err_free_descriptor_program:
pvr_pds_descriptor_program_destroy(device,
allocator,
@ -1003,6 +969,8 @@ err_free_descriptor_program:
err_free_shader:
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
err_free_build_context:
ralloc_free(shader_mem_ctx);
return result;
}
@ -1067,27 +1035,24 @@ pvr_compute_pipeline_create(struct pvr_device *device,
return VK_SUCCESS;
}
static void pvr_pipeline_destroy_shader_data(pco_data *data);
static void pvr_compute_pipeline_destroy(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
struct pvr_compute_pipeline *const compute_pipeline)
{
if (compute_pipeline->flags.base_workgroup) {
pvr_pds_compute_base_workgroup_variant_program_finish(
device,
allocator,
&compute_pipeline->primary_base_workgroup_variant_program);
}
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
&compute_pipeline->pds_cs_program,
compute_pipeline->pds_cs_data_section);
pvr_pds_descriptor_program_destroy(device,
allocator,
&compute_pipeline->descriptor_state);
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
pvr_pipeline_destroy_shader_data(&compute_pipeline->cs_data);
pvr_pipeline_finish(device, &compute_pipeline->base);
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
@ -1915,6 +1880,58 @@ static void pvr_setup_fs_input_attachments(
pvr_finishme("pvr_setup_fs_input_attachments");
}
/* Allocates register ranges for the system values read by a compute shader.
 *
 * gl_LocalInvocationIndex is placed in the vertex-input (vtxin) register
 * space, while gl_WorkGroupID and gl_NumWorkGroups are placed in
 * coefficient registers. Multi-dword coefficient values are padded so
 * they start on an even dword offset.
 *
 * Every system value the shader reads must be consumed here; anything
 * left over trips the final assert.
 */
static void pvr_alloc_cs_sysvals(pco_data *data, nir_shader *nir)
{
   BITSET_DECLARE(remaining_sys_vals, SYSTEM_VALUE_MAX);
   BITSET_COPY(remaining_sys_vals, nir->info.system_values_read);

   /* System values allocated from the vtxin register space. */
   gl_system_value vtxin_sys_vals[] = {
      SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
   };

   /* System values allocated from the coefficient register space. */
   gl_system_value coeff_sys_vals[] = {
      SYSTEM_VALUE_WORKGROUP_ID,
      SYSTEM_VALUE_NUM_WORKGROUPS,
   };

   for (unsigned i = 0; i < ARRAY_SIZE(vtxin_sys_vals); ++i) {
      const gl_system_value sys_val = vtxin_sys_vals[i];

      if (!BITSET_TEST(remaining_sys_vals, sys_val))
         continue;

      const nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_val);
      const unsigned dwords = nir_intrinsic_infos[op].dest_components;
      assert(dwords > 0);

      allocate_val(data->common.sys_vals,
                   &data->common.vtxins,
                   sys_val,
                   dwords);

      BITSET_CLEAR(remaining_sys_vals, sys_val);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(coeff_sys_vals); ++i) {
      const gl_system_value sys_val = coeff_sys_vals[i];

      if (!BITSET_TEST(remaining_sys_vals, sys_val))
         continue;

      const nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_val);
      const unsigned dwords = nir_intrinsic_infos[op].dest_components;
      assert(dwords > 0);

      /* Pad so multi-dword values start on an even coefficient offset. */
      if (dwords > 1 && (data->common.coeffs & 1))
         ++data->common.coeffs;

      allocate_val(data->common.sys_vals,
                   &data->common.coeffs,
                   sys_val,
                   dwords);

      BITSET_CLEAR(remaining_sys_vals, sys_val);
   }

   /* All system values read by the shader must have been handled above. */
   assert(BITSET_IS_EMPTY(remaining_sys_vals));
}
static void pvr_init_descriptors(pco_data *data,
nir_shader *nir,
struct vk_pipeline_layout *layout)
@ -1985,27 +2002,28 @@ static void pvr_setup_descriptors(pco_data *data,
assert(data->common.shareds < 256);
}
static void
pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
static void pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout)
{
const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo;
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
pCreateInfo->pVertexInputState;
pGraphicsCreateInfo->pVertexInputState;
pvr_init_vs_attribs(data, vertex_input_state);
break;
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
&pass->subpasses[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
&pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
@ -2017,6 +2035,10 @@ pvr_preprocess_shader_data(pco_data *data,
break;
}
case MESA_SHADER_COMPUTE: {
break;
}
default:
UNREACHABLE("");
}
@ -2026,12 +2048,13 @@ pvr_preprocess_shader_data(pco_data *data,
/* TODO: common things, like large constants being put into shareds. */
}
static void
pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
static void pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout)
{
const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo;
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
pvr_alloc_vs_sysvals(data, nir);
@ -2041,11 +2064,11 @@ pvr_postprocess_shader_data(pco_data *data,
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
&pass->subpasses[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
&pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
@ -2059,6 +2082,11 @@ pvr_postprocess_shader_data(pco_data *data,
break;
}
case MESA_SHADER_COMPUTE: {
pvr_alloc_cs_sysvals(data, nir);
break;
}
default:
UNREACHABLE("");
}

View file

@ -862,16 +862,6 @@ struct pvr_pipeline_stage_state {
struct pvr_compute_shader_state {
/* Pointer to a buffer object that contains the shader binary. */
struct pvr_suballoc_bo *bo;
bool uses_atomic_ops;
bool uses_barrier;
/* E.g. GLSL shader uses gl_NumWorkGroups. */
bool uses_num_workgroups;
uint32_t const_shared_reg_count;
uint32_t input_register_count;
uint32_t work_size;
uint32_t coefficient_register_count;
};
struct pvr_vertex_shader_state {
@ -910,28 +900,17 @@ struct pvr_pipeline {
struct pvr_compute_pipeline {
struct pvr_pipeline base;
pco_data cs_data;
struct pvr_compute_shader_state shader_state;
struct {
uint32_t base_workgroup : 1;
} flags;
struct pvr_stage_allocation_descriptor_state descriptor_state;
struct pvr_pds_upload primary_program;
struct pvr_pds_info primary_program_info;
struct pvr_pds_upload pds_cs_program;
struct pvr_pds_info pds_cs_program_info;
struct pvr_pds_base_workgroup_program {
struct pvr_pds_upload code_upload;
uint32_t *data_section;
/* Offset within the PDS data section at which the base workgroup id
* resides.
*/
uint32_t base_workgroup_data_patching_offset;
struct pvr_pds_info info;
} primary_base_workgroup_variant_program;
uint32_t *pds_cs_data_section;
uint32_t base_workgroup_data_patching_offset;
uint32_t num_workgroups_data_patching_offset;
};
struct pvr_graphics_pipeline {