diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index bbb54a45d76..871f79fbd32 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -3080,7 +3080,9 @@ job_update_ez_state(struct v3dv_job *job, */ /* If the FS writes Z, then it may update against the chosen EZ direction */ - if (pipeline->fs->current_variant->prog_data.fs->writes_z) { + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + if (fs_variant->prog_data.fs->writes_z) { job->ez_state = VC5_EZ_DISABLED; return; } @@ -3673,7 +3675,7 @@ emit_varyings_state(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; struct v3d_fs_prog_data *prog_data_fs = - pipeline->fs->current_variant->prog_data.fs; + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs; const uint32_t num_flags = ARRAY_SIZE(prog_data_fs->flat_shade_flags); @@ -3753,8 +3755,11 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, (pipeline->layout->shader_stages & VK_SHADER_STAGE_FRAGMENT_BIT); if (needs_fs_update) { + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + cmd_buffer->state.uniforms.fs = - v3dv_write_uniforms(cmd_buffer, pipeline->fs); + v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant); } const bool needs_vs_update = @@ -3762,11 +3767,17 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, (pipeline->layout->shader_stages & VK_SHADER_STAGE_VERTEX_BIT); if (needs_vs_update) { + struct v3dv_shader_variant *vs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + + struct v3dv_shader_variant *vs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + cmd_buffer->state.uniforms.vs = - v3dv_write_uniforms(cmd_buffer, pipeline->vs); + v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant); cmd_buffer->state.uniforms.vs_bin = - v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin); + v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant); } } @@ -3780,10 +3791,17 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = state->gfx.pipeline; assert(pipeline); + struct v3d_vs_prog_data *prog_data_vs = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; + struct v3d_vs_prog_data *prog_data_vs_bin = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs; + struct v3d_fs_prog_data *prog_data_fs = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs; + /* Update the cache dirty flag based on the shader progs data */ - job->tmu_dirty_rcl |= pipeline->vs_bin->current_variant->prog_data.vs->base.tmu_dirty_rcl; - job->tmu_dirty_rcl |= pipeline->vs->current_variant->prog_data.vs->base.tmu_dirty_rcl; - job->tmu_dirty_rcl |= pipeline->fs->current_variant->prog_data.fs->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl; + job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl; /* See GFXH-930 workaround below */ uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1); @@ -3796,6 +3814,14 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer) 32); v3dv_return_if_oom(cmd_buffer, NULL); + struct v3dv_shader_variant *vs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + struct v3dv_shader_variant *vs_bin_variant = + 
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; + struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo; + cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -3810,11 +3836,11 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer) pipeline->vpm_cfg.As; shader.coordinate_shader_code_address = - v3dv_cl_address(pipeline->vs_bin->current_variant->assembly_bo, 0); + v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset); shader.vertex_shader_code_address = - v3dv_cl_address(pipeline->vs->current_variant->assembly_bo, 0); + v3dv_cl_address(assembly_bo, vs_variant->assembly_offset); shader.fragment_shader_code_address = - v3dv_cl_address(pipeline->fs->current_variant->assembly_bo, 0); + v3dv_cl_address(assembly_bo, fs_variant->assembly_offset); shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin; shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; @@ -3825,12 +3851,6 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer) } /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */ - struct v3d_vs_prog_data *prog_data_vs = - pipeline->vs->current_variant->prog_data.vs; - - struct v3d_vs_prog_data *prog_data_vs_bin = - pipeline->vs_bin->current_variant->prog_data.vs; - bool cs_loaded_any = false; const bool cs_uses_builtins = prog_data_vs_bin->uses_iid || prog_data_vs_bin->uses_biid || @@ -5122,7 +5142,8 @@ static void cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) { assert(cmd_buffer->state.compute.pipeline); - assert(cmd_buffer->state.compute.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); + assert(cmd_buffer->state.compute.pipeline->active_stages == + VK_SHADER_STAGE_COMPUTE_BIT); uint32_t *dirty = &cmd_buffer->state.dirty; *dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE | @@ -5198,7 +5219,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t *wg_size_out) { struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; - assert(pipeline && pipeline->cs && pipeline->cs->current_variant); + assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + struct v3dv_shader_variant *cs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]; struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_job), 8, @@ -5222,7 +5245,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT; const struct v3d_compute_prog_data *cpd = - pipeline->cs->current_variant->prog_data.cs; + cs_variant->prog_data.cs; const uint32_t wgs_per_sg = 1; /* FIXME */ const uint32_t wg_size = cpd->local_size[0] * @@ -5230,7 +5253,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, cpd->local_size[2]; submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) << - V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); + V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT); submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; if (wg_size_out) *wg_size_out = wg_size; @@ -5240,20 +5263,20 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, (group_count_x * group_count_y * group_count_z) - 1; assert(submit->cfg[4] != ~0); - assert(pipeline->cs->current_variant && - pipeline->cs->current_variant->assembly_bo); - const struct 
v3dv_shader_variant *variant = pipeline->cs->current_variant; - submit->cfg[5] = variant->assembly_bo->offset; + assert(pipeline->shared_data->assembly_bo); + struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; + + submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; - if (variant->prog_data.base->single_seg) + if (cs_variant->prog_data.base->single_seg) submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; - if (variant->prog_data.base->threads == 4) + if (cs_variant->prog_data.base->threads == 4) submit->cfg[5] |= V3D_CSD_CFG5_THREADING; - if (variant->prog_data.cs->shared_size > 0) { + if (cs_variant->prog_data.cs->shared_size > 0) { job->csd.shared_memory = v3dv_bo_alloc(cmd_buffer->device, - variant->prog_data.cs->shared_size * wgs_per_sg, + cs_variant->prog_data.cs->shared_size * wgs_per_sg, "shared_vars", true); if (!job->csd.shared_memory) { v3dv_flag_oom(cmd_buffer, NULL); @@ -5261,10 +5284,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, } } - v3dv_job_add_bo(job, variant->assembly_bo); - + v3dv_job_add_bo(job, cs_assembly_bo); struct v3dv_cl_reloc uniforms = - v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs, + v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, + cs_variant, wg_uniform_offsets_out); submit->cfg[6] = uniforms.bo->offset + uniforms.offset; diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c index 6b0de54b391..0a38edb21e1 100644 --- a/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/src/broadcom/vulkan/v3dv_meta_clear.c @@ -235,7 +235,8 @@ create_pipeline(struct v3dv_device *device, struct vk_shader_module fs_m; v3dv_shader_module_internal_init(device, &vs_m, vs_nir); - v3dv_shader_module_internal_init(device, &fs_m, fs_nir); + if (fs_nir) + v3dv_shader_module_internal_init(device, &fs_m, fs_nir); VkPipelineShaderStageCreateInfo stages[2] = { { @@ -247,7 +248,7 @@ create_pipeline(struct v3dv_device *device, { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_FRAGMENT_BIT, - .module = vk_shader_module_to_handle(&fs_m), + .module = fs_nir ? vk_shader_module_to_handle(&fs_m) : VK_NULL_HANDLE, .pName = "main", }, }; diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 5c6816c64b3..190e8026e68 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -83,8 +83,9 @@ void v3dv_shader_variant_destroy(struct v3dv_device *device, struct v3dv_shader_variant *variant) { - if (variant->assembly_bo) - v3dv_bo_free(device, variant->assembly_bo); + /* The assembly BO is shared by all variants in the pipeline, so it can't + * be freed here and should be freed with the pipeline + */ ralloc_free(variant->prog_data.base); vk_free(&device->vk.alloc, variant); } @@ -98,11 +99,30 @@ destroy_pipeline_stage(struct v3dv_device *device, return; ralloc_free(p_stage->nir); - if (p_stage->current_variant) - v3dv_shader_variant_unref(device, p_stage->current_variant); vk_free2(&device->vk.alloc, pAllocator, p_stage); } +static void +pipeline_free_stages(struct v3dv_device *device, + struct v3dv_pipeline *pipeline, + const VkAllocationCallbacks *pAllocator) +{ + assert(pipeline); + + /* FIXME: we can't just use a loop over mesa stage due the bin, would be + * good to find an alternative. 
+ */ + destroy_pipeline_stage(device, pipeline->vs, pAllocator); + destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); + destroy_pipeline_stage(device, pipeline->fs, pAllocator); + destroy_pipeline_stage(device, pipeline->cs, pAllocator); + + pipeline->vs = NULL; + pipeline->vs_bin = NULL; + pipeline->fs = NULL; + pipeline->cs = NULL; +} + static void v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_device *device, @@ -111,13 +131,12 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, if (!pipeline) return; - /* FIXME: we can't just use a loop over mesa stage due the bin, would be - * good to find an alternative. - */ - destroy_pipeline_stage(device, pipeline->vs, pAllocator); - destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); - destroy_pipeline_stage(device, pipeline->fs, pAllocator); - destroy_pipeline_stage(device, pipeline->cs, pAllocator); + pipeline_free_stages(device, pipeline, pAllocator); + + if (pipeline->shared_data) { + v3dv_pipeline_shared_data_unref(device, pipeline->shared_data); + pipeline->shared_data = NULL; + } if (pipeline->spill.bo) { assert(pipeline->spill.size_per_thread > 0); @@ -432,6 +451,7 @@ shader_module_compile_to_nir(struct v3dv_device *device, broadcom_shader_stage_to_gl(stage->stage), stage->entrypoint, &spirv_options, nir_options); + assert(nir); nir_validate_shader(nir, "after spirv_to_nir"); free(spec_entries); } else { @@ -565,7 +585,7 @@ lower_vulkan_resource_index(nir_builder *b, case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { struct v3dv_descriptor_map *descriptor_map = nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ? - &pipeline->ubo_map : &pipeline->ssbo_map; + &pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map; if (!const_val) unreachable("non-constant vulkan_resource_index array index"); @@ -680,9 +700,11 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, uint8_t return_size = relaxed_precision || instr->is_shadow ? 16 : 32; + struct v3dv_descriptor_map *map = is_sampler ? + &pipeline->shared_data->sampler_map : + &pipeline->shared_data->texture_map; int desc_index = - descriptor_map_add(is_sampler ? - &pipeline->sampler_map : &pipeline->texture_map, + descriptor_map_add(map, deref->var->data.descriptor_set, deref->var->data.binding, array_index, @@ -784,7 +806,7 @@ lower_image_deref(nir_builder *b, binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER); int desc_index = - descriptor_map_add(&pipeline->texture_map, + descriptor_map_add(&pipeline->shared_data->texture_map, deref->var->data.descriptor_set, deref->var->data.binding, array_index, @@ -957,8 +979,10 @@ pipeline_populate_v3d_key(struct v3d_key *key, /* The following values are default values used at pipeline create. We use * there 32 bit as default return size. 
*/ - struct v3dv_descriptor_map *sampler_map = &p_stage->pipeline->sampler_map; - struct v3dv_descriptor_map *texture_map = &p_stage->pipeline->texture_map; + struct v3dv_descriptor_map *sampler_map = + &p_stage->pipeline->shared_data->sampler_map; + struct v3dv_descriptor_map *texture_map = + &p_stage->pipeline->shared_data->texture_map; key->num_tex_used = texture_map->num_desc; assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS); @@ -1171,7 +1195,8 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, key->num_used_outputs = 0; } else { struct v3dv_pipeline *pipeline = p_stage->pipeline; - struct v3dv_shader_variant *fs_variant = pipeline->fs->current_variant; + struct v3dv_shader_variant *fs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; key->num_used_outputs = fs_variant->prog_data.fs->num_inputs; @@ -1217,113 +1242,123 @@ pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src, p_stage->stage = BROADCOM_SHADER_VERTEX_BIN; p_stage->entrypoint = src->entrypoint; p_stage->module = src->module; - p_stage->nir = nir_shader_clone(NULL, src->nir); + p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL; p_stage->spec_info = src->spec_info; memcpy(p_stage->shader_sha1, src->shader_sha1, 20); return p_stage; } -/* FIXME: right now this just asks for an bo for the exact size of the qpu - * assembly. It would be good to be able to re-use bos to avoid bo - * fragmentation. This could be tricky though, as right now we are uploading - * the assembly from two paths, when compiling a shader, or when deserializing - * from the pipeline cache. This also means that the same variant can be - * shared by different objects. So with the current approach it is clear who - * owns the assembly bo, but if shared, who owns the shared bo? - * - * For now one-bo per-assembly would work. - * +/** * Returns false if it was not able to allocate or map the assembly bo memory. 
  */
 static bool
-upload_assembly(struct v3dv_device *device,
-                struct v3dv_shader_variant *variant,
-                broadcom_shader_stage stage,
-                const void *data,
-                uint32_t size)
+upload_assembly(struct v3dv_pipeline *pipeline)
 {
-   const char *name = NULL;
-   /* We are uploading the assembly just once, so at this point we shouldn't
-    * have any bo
-    */
-   assert(variant->assembly_bo == NULL);
+   uint32_t total_size = 0;
+   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+      struct v3dv_shader_variant *variant =
+         pipeline->shared_data->variants[stage];
 
-   switch (stage) {
-   case BROADCOM_SHADER_VERTEX:
-      name = "vertex_shader_assembly";
-      break;
-   case BROADCOM_SHADER_VERTEX_BIN:
-      name = "vs_bin_shader_assembly";
-      break;
-   case BROADCOM_SHADER_FRAGMENT:
-      name = "fragment_shader_assembly";
-      break;
-   case BROADCOM_SHADER_COMPUTE:
-      name = "compute_shader_assembly";
-      break;
-   default:
-      unreachable("Stage not supported\n");
-      break;
-   };
+      if (variant != NULL)
+         total_size += variant->qpu_insts_size;
+   }
 
-   struct v3dv_bo *bo = v3dv_bo_alloc(device, size, name, true);
+   struct v3dv_bo *bo = v3dv_bo_alloc(pipeline->device, total_size,
+                                      "pipeline shader assembly", true);
    if (!bo) {
       fprintf(stderr, "failed to allocate memory for shader\n");
       return false;
    }
 
-   bool ok = v3dv_bo_map(device, bo, size);
+   bool ok = v3dv_bo_map(pipeline->device, bo, total_size);
    if (!ok) {
       fprintf(stderr, "failed to map source shader buffer\n");
      return false;
    }
 
-   memcpy(bo->map, data, size);
+   uint32_t offset = 0;
+   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+      struct v3dv_shader_variant *variant =
+         pipeline->shared_data->variants[stage];
 
-   /* We don't unmap the assembly bo, as we would use to gather the assembly
-    * when serializing the variant.
-    */
-   variant->assembly_bo = bo;
+      if (variant != NULL) {
+         variant->assembly_offset = offset;
+
+         memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size);
+         offset += variant->qpu_insts_size;
+
+         /* We don't need qpu_insts anymore. */
+         free(variant->qpu_insts);
+         variant->qpu_insts = NULL;
+      }
+   }
+   assert(total_size == offset);
+
+   pipeline->shared_data->assembly_bo = bo;
 
    return true;
 }
 
 static void
-pipeline_hash_variant(const struct v3dv_pipeline_stage *p_stage,
-                      struct v3d_key *key,
-                      size_t key_size,
-                      unsigned char *sha1_out)
+pipeline_hash_graphics(const struct v3dv_pipeline *pipeline,
+                       struct v3dv_pipeline_key *key,
+                       unsigned char *sha1_out)
 {
    struct mesa_sha1 ctx;
-   struct v3dv_pipeline *pipeline = p_stage->pipeline;
    _mesa_sha1_init(&ctx);
 
-   if (p_stage->stage == BROADCOM_SHADER_COMPUTE) {
-      _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
-   } else {
-      /* We need to include both on the sha1 key as one could affect the other
-       * during linking (like if vertex output are constants, then the
-       * fragment shader would load_const intead of load_input). An
-       * alternative would be to use the serialized nir, but that seems like
-       * an overkill
-       */
-      _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
-                        sizeof(pipeline->vs->shader_sha1));
-      _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
-                        sizeof(pipeline->fs->shader_sha1));
-   }
-   _mesa_sha1_update(&ctx, key, key_size);
+   /* We need to include both in the sha1 key as one could affect the other
+    * during linking (like if vertex outputs are constants, then the
+    * fragment shader would load_const instead of load_input). An
+    * alternative would be to use the serialized nir, but that seems like
+    * overkill.
+    */
+   _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
+                     sizeof(pipeline->vs->shader_sha1));
+   _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
+                     sizeof(pipeline->fs->shader_sha1));
+
+   _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
 
    _mesa_sha1_final(&ctx, sha1_out);
 }
 
-/* Checks that the pipeline has enough spill size to use a specific variant */
 static void
-pipeline_check_spill_size(struct v3dv_pipeline *pipeline,
-                          struct v3dv_shader_variant *variant)
+pipeline_hash_compute(const struct v3dv_pipeline *pipeline,
+                      struct v3dv_pipeline_key *key,
+                      unsigned char *sha1_out)
 {
-   if (variant->prog_data.base->spill_size > pipeline->spill.size_per_thread) {
+   struct mesa_sha1 ctx;
+   _mesa_sha1_init(&ctx);
+
+   _mesa_sha1_update(&ctx, pipeline->cs->shader_sha1,
+                     sizeof(pipeline->cs->shader_sha1));
+
+   _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
+
+   _mesa_sha1_final(&ctx, sha1_out);
+}
+
+/* Checks that the pipeline has enough spill size to use for any of its
+ * variants.
+ */
+static void
+pipeline_check_spill_size(struct v3dv_pipeline *pipeline)
+{
+   uint32_t max_spill_size = 0;
+
+   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+      struct v3dv_shader_variant *variant =
+         pipeline->shared_data->variants[stage];
+
+      if (variant != NULL) {
+         max_spill_size = MAX2(variant->prog_data.base->spill_size,
+                               max_spill_size);
+      }
+   }
+
+   if (max_spill_size > 0) {
       struct v3dv_device *device = pipeline->device;
 
       /* The TIDX register we use for choosing the area to access
@@ -1332,30 +1367,35 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline,
        * means we still multiply by qpus by 4.
        */
       const uint32_t total_spill_size =
-         4 * device->devinfo.qpu_count * variant->prog_data.base->spill_size;
+         4 * device->devinfo.qpu_count * max_spill_size;
      if (pipeline->spill.bo) {
         assert(pipeline->spill.size_per_thread > 0);
         v3dv_bo_free(device, pipeline->spill.bo);
      }
      pipeline->spill.bo = v3dv_bo_alloc(device, total_spill_size, "spill", true);
-      pipeline->spill.size_per_thread = variant->prog_data.base->spill_size;
+      pipeline->spill.size_per_thread = max_spill_size;
    }
 }
 
-/*
- * Creates a new shader_variant_create. Note that for prog_data is const, so
- * it is used only to copy to their own prog_data
+/**
+ * Creates a new shader_variant. Note that prog_data is not const, so it is
+ * assumed that the caller will provide a pointer that the shader_variant
+ * will own.
  *
- * Creation includes allocating a shader source bo, and filling it up.
+ * Creation doesn't include allocating a BO to store the contents of
+ * qpu_insts, as we will try to share the same bo for several shader
+ * variants. Also note that qpu_insts being NULL is valid, for example if we
+ * are creating the shader_variants from the cache, so we can just upload the
+ * assembly of all the shader stages at once.
*/ struct v3dv_shader_variant * v3dv_shader_variant_create(struct v3dv_device *device, broadcom_shader_stage stage, - const unsigned char *variant_sha1, struct v3d_prog_data *prog_data, uint32_t prog_data_size, - const uint64_t *qpu_insts, + uint32_t assembly_offset, + uint64_t *qpu_insts, uint32_t qpu_insts_size, VkResult *out_vk_result) { @@ -1368,70 +1408,35 @@ v3dv_shader_variant_create(struct v3dv_device *device, return NULL; } - variant->ref_cnt = 1; variant->stage = stage; - memcpy(variant->variant_sha1, variant_sha1, sizeof(variant->variant_sha1)); variant->prog_data_size = prog_data_size; variant->prog_data.base = prog_data; - if (qpu_insts) { - if (!upload_assembly(device, variant, stage, - qpu_insts, qpu_insts_size)) { - ralloc_free(variant->prog_data.base); - vk_free(&device->vk.alloc, variant); - - *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; - return NULL; - } - variant->qpu_insts_size = qpu_insts_size; - } + variant->assembly_offset = assembly_offset; + variant->qpu_insts_size = qpu_insts_size; + variant->qpu_insts = qpu_insts; *out_vk_result = VK_SUCCESS; return variant; } -/* For a given key, it returns the compiled version of the shader. If it was - * already compiled, it gets it from the p_stage cache, if not it compiles is - * through the v3d compiler +/* For a given key, it returns the compiled version of the shader. * * If the method returns NULL it means that it was not able to allocate the - * resources for the variant. out_vk_result would return which OOM applies. + * resources for the variant. out_vk_result would return the corresponding OOM + * error. * - * Returns a new reference of the shader_variant to the caller. + * Returns a new reference to the shader_variant to the caller. */ -struct v3dv_shader_variant* -v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, - struct v3dv_pipeline_cache *cache, - struct v3d_key *key, - size_t key_size, - const VkAllocationCallbacks *pAllocator, - VkResult *out_vk_result) +static struct v3dv_shader_variant* +pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, + struct v3d_key *key, + size_t key_size, + const VkAllocationCallbacks *pAllocator, + VkResult *out_vk_result) { - /* We search on the pipeline cache if provided by the user, or the default - * one - */ - unsigned char variant_sha1[20]; - pipeline_hash_variant(p_stage, key, key_size, variant_sha1); - struct v3dv_pipeline *pipeline = p_stage->pipeline; - struct v3dv_device *device = pipeline->device; - if (cache == NULL && device->instance->default_pipeline_cache_enabled) - cache = &device->default_pipeline_cache; - - struct v3dv_shader_variant *variant = - v3dv_pipeline_cache_search_for_variant(pipeline, - cache, - variant_sha1); - - if (variant) { - pipeline_check_spill_size(pipeline, variant); - *out_vk_result = VK_SUCCESS; - return variant; - } - /* If we don't find the variant in any cache, we compile one and add the - * variant to the cache - */ struct v3dv_physical_device *physical_device = &pipeline->device->instance->physicalDevice; const struct v3d_compiler *compiler = physical_device->compiler; @@ -1448,6 +1453,8 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, uint64_t *qpu_insts; uint32_t qpu_insts_size; struct v3d_prog_data *prog_data; + uint32_t prog_data_size = + v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage)); qpu_insts = v3d_compile(compiler, key, &prog_data, @@ -1462,30 +1469,17 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, p_stage->program_id); } - variant = 
v3dv_shader_variant_create(device, p_stage->stage, - variant_sha1, - prog_data, v3d_prog_data_size(p_stage->stage), - qpu_insts, qpu_insts_size, - out_vk_result); - if (qpu_insts) - free(qpu_insts); + struct v3dv_shader_variant *variant = + v3dv_shader_variant_create(pipeline->device, p_stage->stage, + prog_data, prog_data_size, + 0, /* assembly_offset, no final value yet */ + qpu_insts, qpu_insts_size, + out_vk_result); - if (variant) - pipeline_check_spill_size(pipeline, variant); - - if (*out_vk_result == VK_SUCCESS) { - struct v3dv_pipeline_cache *default_cache = - &pipeline->device->default_pipeline_cache; - - v3dv_pipeline_cache_upload_variant(pipeline, cache, variant); - - /* Ensure that the NIR shader is on the default cache, as cmd_buffer could - * need to change the current variant. - */ - if (default_cache != cache) { - v3dv_pipeline_cache_upload_variant(pipeline, default_cache, variant); - } - } + /* At this point we don't need anymore the nir shader, but we are freeing + * all the temporary p_stage structs used during the pipeline creation when + * we finish it, so let's not worry about freeing the nir here. + */ return variant; } @@ -1596,12 +1590,12 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, * another for the case we need a 32bit return size. */ UNUSED unsigned index = - descriptor_map_add(&pipeline->sampler_map, + descriptor_map_add(&pipeline->shared_data->sampler_map, -1, -1, -1, 0, 16); assert(index == V3DV_NO_SAMPLER_16BIT_IDX); index = - descriptor_map_add(&pipeline->sampler_map, + descriptor_map_add(&pipeline->shared_data->sampler_map, -2, -2, -2, 0, 32); assert(index == V3DV_NO_SAMPLER_32BIT_IDX); @@ -1693,75 +1687,184 @@ pipeline_hash_shader(const struct vk_shader_module *module, _mesa_sha1_final(&ctx, sha1_out); } - static VkResult pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline, - struct v3dv_pipeline_cache *cache, - const VkGraphicsPipelineCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator) + const VkAllocationCallbacks *pAllocator, + const VkGraphicsPipelineCreateInfo *pCreateInfo) { struct v3dv_pipeline_stage *p_stage = pipeline->vs; - pipeline_lower_nir(pipeline, p_stage, pipeline->layout); /* Right now we only support pipelines with both vertex and fragment * shader. */ - assert(pipeline->fs); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); - /* Make sure we do all our common lowering *before* we create the vs - * and vs_bin pipeline stages, since from that point forward we need to - * run lowerings for both of them separately, since each stage will - * own its NIR code. 
- */ - lower_vs_io(p_stage->nir); - - pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, pAllocator); - if (pipeline->vs_bin == NULL) - return VK_ERROR_OUT_OF_HOST_MEMORY; + assert(pipeline->vs_bin != NULL); + if (pipeline->vs_bin->nir == NULL) { + assert(pipeline->vs->nir); + pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir); + } + VkResult vk_result; struct v3d_vs_key key; pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs); - VkResult vk_result; - pipeline->vs->current_variant = - v3dv_get_shader_variant(pipeline->vs, cache, &key.base, sizeof(key), - pAllocator, &vk_result); + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] = + pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key), + pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; - pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin); - pipeline->vs_bin->current_variant = - v3dv_get_shader_variant(pipeline->vs_bin, cache, &key.base, sizeof(key), - pAllocator, &vk_result); + p_stage = pipeline->vs_bin; + pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage); + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] = + pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key), + pAllocator, &vk_result); return vk_result; } static VkResult pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, - struct v3dv_pipeline_cache *cache, - const VkGraphicsPipelineCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator) + const VkAllocationCallbacks *pAllocator, + const VkGraphicsPipelineCreateInfo *pCreateInfo) { struct v3dv_pipeline_stage *p_stage = pipeline->vs; p_stage = pipeline->fs; - pipeline_lower_nir(pipeline, p_stage, pipeline->layout); struct v3d_fs_key key; pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage, get_ucp_enable_mask(pipeline->vs)); - lower_fs_io(p_stage->nir); - VkResult vk_result; - p_stage->current_variant = - v3dv_get_shader_variant(p_stage, cache, &key.base, sizeof(key), - pAllocator, &vk_result); + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] = + pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key), + pAllocator, &vk_result); return vk_result; } +static void +pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_key *key, + const VkGraphicsPipelineCreateInfo *pCreateInfo) +{ + memset(key, 0, sizeof(*key)); + key->robust_buffer_access = + pipeline->device->features.robustBufferAccess; + + const VkPipelineInputAssemblyStateCreateInfo *ia_info = + pCreateInfo->pInputAssemblyState; + key->topology = vk_to_pipe_prim_type[ia_info->topology]; + + const VkPipelineColorBlendStateCreateInfo *cb_info = + pCreateInfo->pColorBlendState; + key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ? + vk_to_pipe_logicop[cb_info->logicOp] : + PIPE_LOGICOP_COPY; + + const bool raster_enabled = + !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; + + /* Multisample rasterization state must be ignored if rasterization + * is disabled. + */ + const VkPipelineMultisampleStateCreateInfo *ms_info = + raster_enabled ? 
pCreateInfo->pMultisampleState : NULL; + if (ms_info) { + assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT || + ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT); + key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT; + + if (key->msaa) { + key->sample_coverage = + pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1; + key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; + key->sample_alpha_to_one = ms_info->alphaToOneEnable; + } + } + + const struct v3dv_render_pass *pass = + v3dv_render_pass_from_handle(pCreateInfo->renderPass); + const struct v3dv_subpass *subpass = pipeline->subpass; + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t att_idx = subpass->color_attachments[i].attachment; + if (att_idx == VK_ATTACHMENT_UNUSED) + continue; + + key->cbufs |= 1 << i; + + VkFormat fb_format = pass->attachments[att_idx].desc.format; + enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); + + /* If logic operations are enabled then we might emit color reads and we + * need to know the color buffer format and swizzle for that + */ + if (key->logicop_func != PIPE_LOGICOP_COPY) { + key->color_fmt[i].format = fb_pipe_format; + key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format); + } + + const struct util_format_description *desc = + vk_format_description(fb_format); + + if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + desc->channel[0].size == 32) { + key->f32_color_rb |= 1 << i; + } + } + + const VkPipelineVertexInputStateCreateInfo *vi_info = + pCreateInfo->pVertexInputState; + for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription *desc = + &vi_info->pVertexAttributeDescriptions[i]; + assert(desc->location < MAX_VERTEX_ATTRIBS); + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } + +} + +static void +pipeline_populate_compute_key(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_key *key, + const VkComputePipelineCreateInfo *pCreateInfo) +{ + /* We use the same pipeline key for graphics and compute, but we don't need + * to add a field to flag compute keys because this key is not used alone + * to search in the cache, we also use the SPIR-V or the serialized NIR for + * example, which already flags compute shaders. + */ + memset(key, 0, sizeof(*key)); + key->robust_buffer_access = + pipeline->device->features.robustBufferAccess; +} + +static struct v3dv_pipeline_shared_data * +v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], + struct v3dv_device *device) +{ + size_t size = sizeof(struct v3dv_pipeline_shared_data); + /* We create new_entry using the device alloc. Right now shared_data is ref + * and unref by both the pipeline and the pipeline cache, so we can't + * ensure that the cache or pipeline alloc will be available on the last + * unref. + */ + struct v3dv_pipeline_shared_data *new_entry = + vk_zalloc2(&device->vk.alloc, NULL, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (new_entry == NULL) + return NULL; + + new_entry->ref_cnt = 1; + memcpy(new_entry->sha1_key, sha1_key, 20); + + return new_entry; +} + /* * It compiles a pipeline. 
Note that it also allocate internal object, but if * some allocations success, but other fails, the method is not freeing the @@ -1782,8 +1885,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, struct v3dv_physical_device *physical_device = &device->instance->physicalDevice; - /* First pass to get the the common info from the shader and the nir - * shader. We don't care of the coord shader for now. + /* First pass to get some common info from the shader, and create the + * individual pipeline_stage objects */ for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i]; @@ -1819,11 +1922,19 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline->active_stages |= sinfo->stage; - p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); + /* We will try to get directly the compiled shader variant, so let's not + * worry about getting the nir shader for now. + */ + p_stage->nir = NULL; switch(stage) { case MESA_SHADER_VERTEX: pipeline->vs = p_stage; + pipeline->vs_bin = + pipeline_stage_create_vs_bin(pipeline->vs, pAllocator); + if (pipeline->vs_bin == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; + break; case MESA_SHADER_FRAGMENT: pipeline->fs = p_stage; @@ -1864,34 +1975,86 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline->active_stages |= MESA_SHADER_FRAGMENT; } - /* Linking */ + /* Now we will try to get the variants from the pipeline cache */ + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo); + unsigned char pipeline_sha1[20]; + pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1); + + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + + if (pipeline->shared_data != NULL) { + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); + assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); + + goto success; + } + + pipeline->shared_data = + v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline->device); + /* If not, we try to get the nir shaders (from the SPIR-V shader, or from + * the pipeline cache again) and compile. 
+ */ + if (!pipeline->vs->nir) + pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); + if (!pipeline->fs->nir) + pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache); + + /* Linking + pipeline lowerings */ link_shaders(pipeline->vs->nir, pipeline->fs->nir); - /* Compiling to vir (or getting it from a cache); - */ + pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout); + lower_fs_io(pipeline->fs->nir); + + pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout); + lower_vs_io(pipeline->vs->nir); + + /* Compiling to vir */ VkResult vk_result; - vk_result = pipeline_compile_fragment_shader(pipeline, cache, - pCreateInfo, pAllocator); + + /* We should have got all the variants or no variants from the cache */ + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]); + vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo); if (vk_result != VK_SUCCESS) return vk_result; - vk_result = pipeline_compile_vertex_shader(pipeline, cache, - pCreateInfo, pAllocator); + assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] && + !pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]); + + vk_result = pipeline_compile_vertex_shader(pipeline, pAllocator, pCreateInfo); if (vk_result != VK_SUCCESS) return vk_result; + if (!upload_assembly(pipeline)) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + v3dv_pipeline_cache_upload_pipeline(pipeline, cache); + + /* As we got the variants in pipeline->shared_data, after compiling we + * don't need the pipeline_stages + */ + pipeline_free_stages(device, pipeline, pAllocator); + + success: + pipeline_check_spill_size(pipeline); + /* FIXME: values below are default when non-GS is available. Would need to * provide real values if GS gets supported */ + struct v3dv_shader_variant *vs_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; + struct v3dv_shader_variant *vs_bin_variant = + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; + pipeline->vpm_cfg_bin.As = 1; pipeline->vpm_cfg_bin.Ve = 0; - pipeline->vpm_cfg_bin.Vc = - pipeline->vs_bin->current_variant->prog_data.vs->vcm_cache_size; + pipeline->vpm_cfg_bin.Vc = vs_bin_variant->prog_data.vs->vcm_cache_size; pipeline->vpm_cfg.As = 1; pipeline->vpm_cfg.Ve = 0; - pipeline->vpm_cfg.Vc = - pipeline->vs->current_variant->prog_data.vs->vcm_cache_size; + pipeline->vpm_cfg.Vc = vs_variant->prog_data.vs->vcm_cache_size; return VK_SUCCESS; } @@ -2397,13 +2560,13 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = - pipeline->fs->current_variant->prog_data.fs; + pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs; struct v3d_vs_prog_data *prog_data_vs = - pipeline->vs->current_variant->prog_data.vs; + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; struct v3d_vs_prog_data *prog_data_vs_bin = - pipeline->vs_bin->current_variant->prog_data.vs; + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs; /* Note: we are not packing addresses, as we need the job (see @@ -2787,7 +2950,7 @@ pipeline_init(struct v3dv_pipeline *pipeline, pipeline->va_count = 0; struct v3d_vs_prog_data *prog_data_vs = - pipeline->vs->current_variant->prog_data.vs; + pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { const VkVertexInputAttributeDescription *desc = @@ -2835,7 +2998,7 @@ 
graphics_pipeline_create(VkDevice _device, /* Use the default pipeline cache if none is specified */ if (cache == NULL && device->instance->default_pipeline_cache_enabled) - cache = &device->default_pipeline_cache; + cache = &device->default_pipeline_cache; pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); @@ -2945,24 +3108,62 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, p_stage->spec_info, p_stage->shader_sha1); - p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); + /* We try to get directly the variant first from the cache */ + p_stage->nir = NULL; + pipeline->cs = p_stage; pipeline->active_stages |= sinfo->stage; + + struct v3dv_pipeline_key pipeline_key; + pipeline_populate_compute_key(pipeline, &pipeline_key, info); + unsigned char pipeline_sha1[20]; + pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1); + + pipeline->shared_data = + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + + if (pipeline->shared_data != NULL) { + assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + goto success; + } + + pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, + pipeline->device); + + /* If not found on cache, compile it */ + p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); + assert(p_stage->nir); + st_nir_opts(p_stage->nir); pipeline_lower_nir(pipeline, p_stage, pipeline->layout); lower_cs_shared(p_stage->nir); - pipeline->cs = p_stage; + VkResult result = VK_SUCCESS; struct v3d_key key; memset(&key, 0, sizeof(key)); pipeline_populate_v3d_key(&key, p_stage, 0, pipeline->device->features.robustBufferAccess); + pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] = + pipeline_compile_shader_variant(p_stage, &key, sizeof(key), + alloc, &result); - VkResult result; - p_stage->current_variant = - v3dv_get_shader_variant(p_stage, cache, &key, sizeof(key), alloc, &result); - return result; + if (result != VK_SUCCESS) + return result; + + if (!upload_assembly(pipeline)) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + v3dv_pipeline_cache_upload_pipeline(pipeline, cache); + /* As we got the variants in pipeline->shared_data, after compiling we + * don't need the pipeline_stages + */ + pipeline_free_stages(device, pipeline, alloc); + + success: + pipeline_check_spill_size(pipeline); + + return VK_SUCCESS; } static VkResult @@ -2997,7 +3198,7 @@ compute_pipeline_create(VkDevice _device, /* Use the default pipeline cache if none is specified */ if (cache == NULL && device->instance->default_pipeline_cache_enabled) - cache = &device->default_pipeline_cache; + cache = &device->default_pipeline_cache; pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c index ab1f71639af..4200eba9fa5 100644 --- a/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -58,9 +58,9 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache) fprintf(stderr, " NIR cache miss count: %d\n", cache->nir_stats.miss); fprintf(stderr, " NIR cache hit count: %d\n", cache->nir_stats.hit); - fprintf(stderr, " variant cache entries: %d\n", cache->variant_stats.count); - fprintf(stderr, " variant cache miss count: %d\n", cache->variant_stats.miss); - fprintf(stderr, " variant cache hit count: %d\n", cache->variant_stats.hit); + fprintf(stderr, " cache entries: %d\n", cache->stats.count); + fprintf(stderr, " cache miss 
count: %d\n", cache->stats.miss); + fprintf(stderr, " cache hit count: %d\n", cache->stats.hit); } void @@ -197,59 +197,65 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache, cache->nir_stats.hit = 0; cache->nir_stats.count = 0; - cache->variant_cache = _mesa_hash_table_create(NULL, sha1_hash_func, - sha1_compare_func); - cache->variant_stats.miss = 0; - cache->variant_stats.hit = 0; - cache->variant_stats.count = 0; + cache->cache = _mesa_hash_table_create(NULL, sha1_hash_func, + sha1_compare_func); + cache->stats.miss = 0; + cache->stats.hit = 0; + cache->stats.count = 0; } else { cache->nir_cache = NULL; - cache->variant_cache = NULL; + cache->cache = NULL; } } -struct v3dv_shader_variant* -v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline, - struct v3dv_pipeline_cache *cache, - unsigned char sha1_key[20]) +/** + * It searchs for pipeline cached data, and returns a v3dv_pipeline_shared_data with + * it, or NULL if doesn't have it cached. On the former, it will increases the + * ref_count, so caller is responsible to unref it. + */ +struct v3dv_pipeline_shared_data * +v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, + unsigned char sha1_key[20]) { - if (!cache || !cache->variant_cache) + if (!cache || !cache->cache) return NULL; if (debug_cache) { char sha1buf[41]; _mesa_sha1_format(sha1buf, sha1_key); - fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf); + fprintf(stderr, "pipeline cache %p, search pipeline with key %s\n", cache, sha1buf); } pthread_mutex_lock(&cache->mutex); struct hash_entry *entry = - _mesa_hash_table_search(cache->variant_cache, sha1_key); + _mesa_hash_table_search(cache->cache, sha1_key); if (entry) { - struct v3dv_shader_variant *variant = - (struct v3dv_shader_variant *) entry->data; + struct v3dv_pipeline_shared_data *cache_entry = + (struct v3dv_pipeline_shared_data *) entry->data; + assert(cache_entry); - cache->variant_stats.hit++; + cache->stats.hit++; if (debug_cache) { - fprintf(stderr, "\tvariant cache hit: %p\n", variant); + fprintf(stderr, "\tcache hit: %p\n", cache_entry); if (dump_stats) cache_dump_stats(cache); } - if (variant) - v3dv_shader_variant_ref(variant); + + v3dv_pipeline_shared_data_ref(cache_entry); pthread_mutex_unlock(&cache->mutex); - return variant; + + return cache_entry; } - cache->variant_stats.miss++; + cache->stats.miss++; if (debug_cache) { - fprintf(stderr, "\tvariant cache miss\n"); + fprintf(stderr, "\tcache miss\n"); if (dump_stats) cache_dump_stats(cache); } @@ -259,34 +265,109 @@ v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline, } void -v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline, - struct v3dv_pipeline_cache *cache, - struct v3dv_shader_variant *variant) +v3dv_pipeline_shared_data_destroy(struct v3dv_device *device, + struct v3dv_pipeline_shared_data *shared_data) { - if (!cache || !cache->variant_cache) + assert(shared_data->ref_cnt == 0); + + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) { + if (shared_data->variants[stage] != NULL) + v3dv_shader_variant_destroy(device, shared_data->variants[stage]); + } + + if (shared_data->assembly_bo) + v3dv_bo_free(device, shared_data->assembly_bo); + + vk_free(&device->vk.alloc, shared_data); +} + +static struct v3dv_pipeline_shared_data * +v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache, + const unsigned char sha1_key[20], + struct v3dv_shader_variant **variants, + const struct v3dv_descriptor_map *ubo_map, + const 
struct v3dv_descriptor_map *ssbo_map, + const struct v3dv_descriptor_map *sampler_map, + const struct v3dv_descriptor_map *texture_map, + const uint64_t *total_assembly, + const uint32_t total_assembly_size) +{ + size_t size = sizeof(struct v3dv_pipeline_shared_data); + /* We create new_entry using the device alloc. Right now shared_data is ref + * and unref by both the pipeline and the pipeline cache, so we can't + * ensure that the cache or pipeline alloc will be available on the last + * unref. + */ + struct v3dv_pipeline_shared_data *new_entry = + vk_zalloc2(&cache->device->vk.alloc, NULL, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (new_entry == NULL) + return NULL; + + new_entry->ref_cnt = 1; + memcpy(new_entry->sha1_key, sha1_key, 20); + + memcpy(&new_entry->ubo_map, ubo_map, sizeof(struct v3dv_descriptor_map)); + memcpy(&new_entry->ssbo_map, ssbo_map, sizeof(struct v3dv_descriptor_map)); + memcpy(&new_entry->sampler_map, sampler_map, sizeof(struct v3dv_descriptor_map)); + memcpy(&new_entry->texture_map, texture_map, sizeof(struct v3dv_descriptor_map)); + + for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) + new_entry->variants[stage] = variants[stage]; + + struct v3dv_bo *bo = v3dv_bo_alloc(cache->device, total_assembly_size, + "pipeline shader assembly", true); + if (!bo) { + fprintf(stderr, "failed to allocate memory for shaders assembly\n"); + v3dv_pipeline_shared_data_unref(cache->device, new_entry); + return NULL; + } + + bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size); + if (!ok) { + fprintf(stderr, "failed to map source shader buffer\n"); + v3dv_pipeline_shared_data_unref(cache->device, new_entry); + return NULL; + } + + memcpy(bo->map, total_assembly, total_assembly_size); + + new_entry->assembly_bo = bo; + + return new_entry; +} + +/* Uploads all the "cacheable" or shared data from the pipeline */ +void +v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache) +{ + if (!cache || !cache->cache) return; - if (cache->variant_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES) + if (cache->stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES) return; pthread_mutex_lock(&cache->mutex); struct hash_entry *entry = - _mesa_hash_table_search(cache->variant_cache, variant->variant_sha1); + _mesa_hash_table_search(cache->cache, pipeline->shared_data->sha1_key); if (entry) { pthread_mutex_unlock(&cache->mutex); return; } - v3dv_shader_variant_ref(variant); - _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant); - cache->variant_stats.count++; + v3dv_pipeline_shared_data_ref(pipeline->shared_data); + _mesa_hash_table_insert(cache->cache, pipeline->shared_data->sha1_key, + pipeline->shared_data); + cache->stats.count++; if (debug_cache) { char sha1buf[41]; - _mesa_sha1_format(sha1buf, variant->variant_sha1); + _mesa_sha1_format(sha1buf, pipeline->shared_data->sha1_key); - fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n", - cache, sha1buf, variant); + fprintf(stderr, "pipeline cache %p, new cache entry with sha1 key %s:%p\n\n", + cache, sha1buf, pipeline->shared_data); if (dump_stats) cache_dump_stats(cache); } @@ -321,8 +402,6 @@ shader_variant_create_from_blob(struct v3dv_device *device, broadcom_shader_stage stage = blob_read_uint32(blob); - const unsigned char *variant_sha1 = blob_read_bytes(blob, 20); - uint32_t prog_data_size = blob_read_uint32(blob); /* FIXME: as we include the stage perhaps we can avoid prog_data_size? 
*/ assert(prog_data_size == v3d_prog_data_size(broadcom_shader_stage_to_gl(stage))); @@ -342,10 +421,8 @@ shader_variant_create_from_blob(struct v3dv_device *device, if (blob->overrun) return NULL; + uint32_t assembly_offset = blob_read_uint32(blob); uint32_t qpu_insts_size = blob_read_uint32(blob); - const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size); - if (blob->overrun) - return NULL; /* shader_variant_create expects a newly created prog_data for their own, * as it is what the v3d compiler returns. So we are also allocating one @@ -362,12 +439,53 @@ shader_variant_create_from_blob(struct v3dv_device *device, memcpy(ulist->data, ulist_data_data, ulist_data_size); return v3dv_shader_variant_create(device, stage, - variant_sha1, new_prog_data, prog_data_size, - qpu_insts, qpu_insts_size, + assembly_offset, + NULL, qpu_insts_size, &result); } +static struct v3dv_pipeline_shared_data * +v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache, + struct blob_reader *blob) +{ + const unsigned char *sha1_key = blob_read_bytes(blob, 20); + + const struct v3dv_descriptor_map *ubo_map = + blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); + const struct v3dv_descriptor_map *ssbo_map = + blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); + const struct v3dv_descriptor_map *sampler_map = + blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); + const struct v3dv_descriptor_map *texture_map = + blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map)); + + if (blob->overrun) + return NULL; + + uint8_t variant_count = blob_read_uint8(blob); + + struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 }; + + for (uint8_t count = 0; count < variant_count; count++) { + uint8_t stage = blob_read_uint8(blob); + struct v3dv_shader_variant *variant = + shader_variant_create_from_blob(cache->device, blob); + variants[stage] = variant; + } + + uint32_t total_assembly_size = blob_read_uint32(blob); + const uint64_t *total_assembly = + blob_read_bytes(blob, total_assembly_size); + + if (blob->overrun) + return NULL; + + return v3dv_pipeline_shared_data_new(cache, sha1_key, variants, + ubo_map, ssbo_map, sampler_map, texture_map, + total_assembly, total_assembly_size); +} + static void pipeline_cache_load(struct v3dv_pipeline_cache *cache, size_t size, @@ -377,7 +495,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; struct vk_pipeline_cache_header header; - if (cache->variant_cache == NULL) + if (cache->cache == NULL || cache->nir_cache == NULL) return; struct blob_reader blob; @@ -418,17 +536,18 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, return; for (uint32_t i = 0; i < count; i++) { - struct v3dv_shader_variant *variant = - shader_variant_create_from_blob(device, &blob); - if (!variant) + struct v3dv_pipeline_shared_data *cache_entry = + v3dv_pipeline_shared_data_create_from_blob(cache, &blob); + if (!cache_entry) break; - _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant); - cache->variant_stats.count++; + + _mesa_hash_table_insert(cache->cache, cache_entry->sha1_key, cache_entry); + cache->stats.count++; } if (debug_cache) { fprintf(stderr, "pipeline cache %p, loaded %i nir shaders and " - "%i variant entries\n", cache, nir_count, count); + "%i entries\n", cache, nir_count, count); if (dump_stats) cache_dump_stats(cache); } @@ -482,15 +601,14 @@ v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache) 
       _mesa_hash_table_destroy(cache->nir_cache, NULL);
    }
 
-   if (cache->variant_cache) {
-      hash_table_foreach(cache->variant_cache, entry) {
-         struct v3dv_shader_variant *variant = entry->data;
-         if (variant)
-            v3dv_shader_variant_unref(cache->device, variant);
+   if (cache->cache) {
+      hash_table_foreach(cache->cache, entry) {
+         struct v3dv_pipeline_shared_data *cache_entry = entry->data;
+         if (cache_entry)
+            v3dv_pipeline_shared_data_unref(cache->device, cache_entry);
       }
-      _mesa_hash_table_destroy(cache->variant_cache, NULL);
-
+      _mesa_hash_table_destroy(cache->cache, NULL);
    }
 }
 
@@ -518,12 +636,12 @@ v3dv_MergePipelineCaches(VkDevice device,
 {
    V3DV_FROM_HANDLE(v3dv_pipeline_cache, dst, dstCache);
 
-   if (!dst->variant_cache || !dst->nir_cache)
+   if (!dst->cache || !dst->nir_cache)
       return VK_SUCCESS;
 
    for (uint32_t i = 0; i < srcCacheCount; i++) {
       V3DV_FROM_HANDLE(v3dv_pipeline_cache, src, pSrcCaches[i]);
-      if (!src->variant_cache || !src->nir_cache)
+      if (!src->cache || !src->nir_cache)
         continue;
 
      hash_table_foreach(src->nir_cache, entry) {
@@ -559,22 +677,22 @@ v3dv_MergePipelineCaches(VkDevice device,
         }
      }
 
-      hash_table_foreach(src->variant_cache, entry) {
-         struct v3dv_shader_variant *variant = entry->data;
-         assert(variant);
+      hash_table_foreach(src->cache, entry) {
+         struct v3dv_pipeline_shared_data *cache_entry = entry->data;
+         assert(cache_entry);
 
-         if (_mesa_hash_table_search(dst->variant_cache, variant->variant_sha1))
+         if (_mesa_hash_table_search(dst->cache, cache_entry->sha1_key))
            continue;
 
-         v3dv_shader_variant_ref(variant);
-         _mesa_hash_table_insert(dst->variant_cache, variant->variant_sha1, variant);
+         v3dv_pipeline_shared_data_ref(cache_entry);
+         _mesa_hash_table_insert(dst->cache, cache_entry->sha1_key, cache_entry);
 
-         dst->variant_stats.count++;
+         dst->stats.count++;
         if (debug_cache) {
            char sha1buf[41];
-            _mesa_sha1_format(sha1buf, variant->variant_sha1);
+            _mesa_sha1_format(sha1buf, cache_entry->sha1_key);
 
-            fprintf(stderr, "pipeline cache %p, added variant entry %s "
+            fprintf(stderr, "pipeline cache %p, added entry %s "
                    "from pipeline cache %p\n", dst, sha1buf, src);
 
            if (dump_stats)
@@ -592,8 +710,6 @@ shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
 {
    blob_write_uint32(blob, variant->stage);
 
-   blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));
-
    blob_write_uint32(blob, variant->prog_data_size);
    blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);
 
@@ -602,13 +718,62 @@ shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
    blob_write_bytes(blob, ulist->contents,
                     sizeof(enum quniform_contents) * ulist->count);
    blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count);
 
+   blob_write_uint32(blob, variant->assembly_offset);
    blob_write_uint32(blob, variant->qpu_insts_size);
-   assert(variant->assembly_bo->map);
-   blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);
 
    return !blob->out_of_memory;
 }
 
+static bool
+v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data *cache_entry,
+                                        struct blob *blob)
+{
+   blob_write_bytes(blob, cache_entry->sha1_key, 20);
+
+   blob_write_bytes(blob, &cache_entry->ubo_map,
+                    sizeof(struct v3dv_descriptor_map));
+   blob_write_bytes(blob, &cache_entry->ssbo_map,
+                    sizeof(struct v3dv_descriptor_map));
+   blob_write_bytes(blob, &cache_entry->sampler_map,
+                    sizeof(struct v3dv_descriptor_map));
+   blob_write_bytes(blob, &cache_entry->texture_map,
+                    sizeof(struct v3dv_descriptor_map));
+
+   uint8_t variant_count = 0;
+   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+      if (cache_entry->variants[stage] == NULL)
+         continue;
+      variant_count++;
+   }
+
+   /* Right now we only support compute pipeline, or graphics pipeline with
+    * vertex, vertex bin, and fragment shader.
+    */
+   assert(variant_count == 3 ||
+          (variant_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE]));
+   blob_write_uint8(blob, variant_count);
+
+   uint32_t total_assembly_size = 0;
+   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
+      if (cache_entry->variants[stage] == NULL)
+         continue;
+
+      blob_write_uint8(blob, stage);
+      if (!shader_variant_write_to_blob(cache_entry->variants[stage], blob))
+         return false;
+
+      total_assembly_size += cache_entry->variants[stage]->qpu_insts_size;
+   }
+   blob_write_uint32(blob, total_assembly_size);
+
+   assert(cache_entry->assembly_bo->map);
+   assert(cache_entry->assembly_bo->size > total_assembly_size);
+   blob_write_bytes(blob, cache_entry->assembly_bo->map, total_assembly_size);
+
+   return !blob->out_of_memory;
+}
+
+
 VkResult
 v3dv_GetPipelineCacheData(VkDevice _device,
                           VkPipelineCache _cache,
@@ -679,12 +844,12 @@ v3dv_GetPipelineCacheData(VkDevice _device,
            return VK_INCOMPLETE;
         }
 
-   if (cache->variant_cache) {
-      hash_table_foreach(cache->variant_cache, entry) {
-         struct v3dv_shader_variant *variant = entry->data;
+   if (cache->cache) {
+      hash_table_foreach(cache->cache, entry) {
+         struct v3dv_pipeline_shared_data *cache_entry = entry->data;
         size_t save_size = blob.size;
-         if (!shader_variant_write_to_blob(variant, &blob)) {
+         if (!v3dv_pipeline_shared_data_write_to_blob(cache_entry, &blob)) {
            /* If it fails reset to the previous size and bail */
            blob.size = save_size;
            pthread_mutex_unlock(&cache->mutex);
@@ -703,10 +868,10 @@ v3dv_GetPipelineCacheData(VkDevice _device,
    blob_finish(&blob);
 
    if (debug_cache) {
-      assert(count <= cache->variant_stats.count);
+      assert(count <= cache->stats.count);
      fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
             "%i nir shader entries "
-             "%i variant entries, %u DataSize\n",
+             "%i entries, %u DataSize\n",
             cache, nir_count, count, (uint32_t) *pDataSize);
   }
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index b6af5a7fc46..6266f5e3f52 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -254,6 +254,23 @@ struct v3dv_meta_texel_buffer_copy_pipeline {
    uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
 };
 
+struct v3dv_pipeline_key {
+   bool robust_buffer_access;
+   uint8_t topology;
+   uint8_t logicop_func;
+   bool msaa;
+   bool sample_coverage;
+   bool sample_alpha_to_coverage;
+   bool sample_alpha_to_one;
+   uint8_t cbufs;
+   struct {
+      enum pipe_format format;
+      const uint8_t *swizzle;
+   } color_fmt[V3D_MAX_DRAW_BUFFERS];
+   uint8_t f32_color_rb;
+   uint32_t va_swap_rb_mask;
+};
+
 struct v3dv_pipeline_cache_stats {
    uint32_t miss;
    uint32_t hit;
@@ -314,8 +331,8 @@ struct v3dv_pipeline_cache {
    struct hash_table *nir_cache;
    struct v3dv_pipeline_cache_stats nir_stats;
 
-   struct hash_table *variant_cache;
-   struct v3dv_pipeline_cache_stats variant_stats;
+   struct hash_table *cache;
+   struct v3dv_pipeline_cache_stats stats;
 };
 
 struct v3dv_device {
@@ -1340,15 +1357,8 @@ vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
 }
 
 struct v3dv_shader_variant {
-   uint32_t ref_cnt;
-
    broadcom_shader_stage stage;
 
-   /* key for the pipeline cache, it is p_stage shader_sha1 + v3d compiler
-    * sha1
-    */
-   unsigned char variant_sha1[20];
-
    union {
       struct v3d_prog_data *base;
       struct v3d_vs_prog_data *vs;
@@ -1360,11 +1370,17 @@ struct v3dv_shader_variant {
    * serialize
    */
    uint32_t prog_data_size;
-   /* FIXME: using one bo per shader. Eventually we would be interested on
-    * reusing the same bo for all the shaders, like a bo per v3dv_pipeline for
-    * shaders.
+
+   /* The assembly for this variant will be uploaded to a BO shared with all
+    * other shader stages in that pipeline. This is the offset in that BO.
     */
-   struct v3dv_bo *assembly_bo;
+   uint32_t assembly_offset;
+
+   /* Note: it is really likely that qpu_insts would be NULL, as it will be
+    * used only temporarily, to upload it to the shared bo, as we compile the
+    * different stages individually.
+    */
+   uint64_t *qpu_insts;
    uint32_t qpu_insts_size;
 };
 
@@ -1393,8 +1409,6 @@ struct v3dv_pipeline_stage {
 
    /** A name for this program, so you can track it in shader-db output. */
    uint32_t program_id;
-
-   struct v3dv_shader_variant*current_variant;
 };
 
 /* FIXME: although the full vpm_config is not required at this point, as we
@@ -1606,6 +1620,25 @@ v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key,
    *sampler_index = sampler;
 }
 
+/* The structure represents data shared between different objects, like the
+ * pipeline and the pipeline cache, so we ref count it to know when it should
+ * be freed.
+ */
+struct v3dv_pipeline_shared_data {
+   uint32_t ref_cnt;
+
+   unsigned char sha1_key[20];
+
+   struct v3dv_descriptor_map ubo_map;
+   struct v3dv_descriptor_map ssbo_map;
+   struct v3dv_descriptor_map sampler_map;
+   struct v3dv_descriptor_map texture_map;
+
+   struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES];
+
+   struct v3dv_bo *assembly_bo;
+};
+
 struct v3dv_pipeline {
    struct vk_object_base base;
 
@@ -1668,11 +1701,7 @@ struct v3dv_pipeline {
 
    enum pipe_prim_type topology;
 
-   struct v3dv_descriptor_map ubo_map;
-   struct v3dv_descriptor_map ssbo_map;
-
-   struct v3dv_descriptor_map sampler_map;
-   struct v3dv_descriptor_map texture_map;
+   struct v3dv_pipeline_shared_data *shared_data;
 
    /* FIXME: this bo is another candidate to data to be uploaded using a
    * resource manager, instead of a individual bo
@@ -1848,9 +1877,12 @@ void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
                           const struct pipe_box *box);
 
 struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
-                                         struct v3dv_pipeline_stage *p_stage);
+                                         struct v3dv_pipeline *pipeline,
+                                         struct v3dv_shader_variant *variant);
+
 struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
-                                                    struct v3dv_pipeline_stage *p_stage,
+                                                    struct v3dv_pipeline *pipeline,
+                                                    struct v3dv_shader_variant *variant,
                                                     uint32_t **wg_count_offsets);
 
 struct v3dv_shader_variant *
@@ -1864,10 +1896,10 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
 struct v3dv_shader_variant *
 v3dv_shader_variant_create(struct v3dv_device *device,
                            broadcom_shader_stage stage,
-                           const unsigned char *variant_sha1,
                            struct v3d_prog_data *prog_data,
                            uint32_t prog_data_size,
-                           const uint64_t *qpu_insts,
+                           uint32_t assembly_offset,
+                           uint64_t *qpu_insts,
                            uint32_t qpu_insts_size,
                            VkResult *out_vk_result);
 
@@ -1876,19 +1908,23 @@ v3dv_shader_variant_destroy(struct v3dv_device *device,
                            struct v3dv_shader_variant *variant);
 
 static inline void
-v3dv_shader_variant_ref(struct v3dv_shader_variant *variant)
+v3dv_pipeline_shared_data_ref(struct v3dv_pipeline_shared_data *shared_data)
 {
-   assert(variant && variant->ref_cnt >= 1);
-   p_atomic_inc(&variant->ref_cnt);
+   assert(shared_data && shared_data->ref_cnt >= 1);
+   p_atomic_inc(&shared_data->ref_cnt);
 }
 
+void
+v3dv_pipeline_shared_data_destroy(struct v3dv_device *device,
+                                  struct v3dv_pipeline_shared_data *shared_data);
+
 static inline void
-v3dv_shader_variant_unref(struct v3dv_device *device,
-                          struct v3dv_shader_variant *variant)
+v3dv_pipeline_shared_data_unref(struct v3dv_device *device,
+                                struct v3dv_pipeline_shared_data *shared_data)
 {
-   assert(variant && variant->ref_cnt >= 1);
-   if (p_atomic_dec_zero(&variant->ref_cnt))
-      v3dv_shader_variant_destroy(device, variant);
+   assert(shared_data && shared_data->ref_cnt >= 1);
+   if (p_atomic_dec_zero(&shared_data->ref_cnt))
+      v3dv_pipeline_shared_data_destroy(device, shared_data);
 }
 
 struct v3dv_descriptor *
@@ -1953,15 +1989,13 @@ nir_shader*
 v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
                                    const nir_shader_compiler_options *nir_options,
                                    unsigned char sha1_key[20]);
 
-struct v3dv_shader_variant*
-v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
-                                       struct v3dv_pipeline_cache *cache,
-                                       unsigned char sha1_key[20]);
+struct v3dv_pipeline_shared_data *
+v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
+                                        unsigned char sha1_key[20]);
 
 void
-v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
-                                   struct v3dv_pipeline_cache *cache,
-                                   struct v3dv_shader_variant *variant);
+v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
+                                    struct v3dv_pipeline_cache *cache);
 
 void
 v3dv_shader_module_internal_init(struct v3dv_device *device,
                                  struct vk_shader_module *module,
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index e0eea03d194..8dd085862e8 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -97,14 +97,15 @@ write_tmu_p0(struct v3dv_cmd_buffer *cmd_buffer,
 
    /* We need to ensure that the texture bo is added to the job */
    struct v3dv_bo *texture_bo =
-      v3dv_descriptor_map_get_texture_bo(descriptor_state, &pipeline->texture_map,
+      v3dv_descriptor_map_get_texture_bo(descriptor_state,
+                                         &pipeline->shared_data->texture_map,
                                          pipeline->layout, texture_idx);
    assert(texture_bo);
    v3dv_job_add_bo(job, texture_bo);
 
    struct v3dv_cl_reloc state_reloc =
       v3dv_descriptor_map_get_texture_shader_state(descriptor_state,
-                                                   &pipeline->texture_map,
+                                                   &pipeline->shared_data->texture_map,
                                                    pipeline->layout, texture_idx);
 
@@ -130,12 +131,14 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer,
           sampler_idx != V3DV_NO_SAMPLER_32BIT_IDX);
 
    struct v3dv_cl_reloc sampler_state_reloc =
-      v3dv_descriptor_map_get_sampler_state(descriptor_state, &pipeline->sampler_map,
+      v3dv_descriptor_map_get_sampler_state(descriptor_state,
+                                            &pipeline->shared_data->sampler_map,
                                             pipeline->layout, sampler_idx);
 
    const struct v3dv_sampler *sampler =
-      v3dv_descriptor_map_get_sampler(descriptor_state, &pipeline->sampler_map,
-                                      pipeline->layout, sampler_idx);
+      v3dv_descriptor_map_get_sampler(descriptor_state,
+                                      &pipeline->shared_data->sampler_map,
+                                      pipeline->layout, sampler_idx);
    assert(sampler);
 
    /* Set unnormalized coordinates flag from sampler object */
@@ -167,7 +170,7 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
    struct v3dv_descriptor_map *map =
       content == QUNIFORM_UBO_ADDR || content == QUNIFORM_GET_UBO_SIZE ?
-      &pipeline->ubo_map : &pipeline->ssbo_map;
+      &pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map;
 
    uint32_t offset =
       content == QUNIFORM_UBO_ADDR ?
@@ -285,7 +288,7 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
 
    struct v3dv_descriptor *descriptor =
       v3dv_descriptor_map_get_descriptor(descriptor_state,
-                                         &pipeline->texture_map,
+                                         &pipeline->shared_data->texture_map,
                                          pipeline->layout,
                                          texture_idx, NULL);
 
@@ -309,13 +312,13 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
 
 struct v3dv_cl_reloc
 v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
-                               struct v3dv_pipeline_stage *p_stage,
+                               struct v3dv_pipeline *pipeline,
+                               struct v3dv_shader_variant *variant,
                                uint32_t **wg_count_offsets)
 {
    struct v3d_uniform_list *uinfo =
-      &p_stage->current_variant->prog_data.base->uniforms;
+      &variant->prog_data.base->uniforms;
    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
-   struct v3dv_pipeline *pipeline = p_stage->pipeline;
 
    struct v3dv_job *job = cmd_buffer->state.job;
    assert(job);
@@ -432,7 +435,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
 
 struct v3dv_cl_reloc
 v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
-                    struct v3dv_pipeline_stage *p_stage)
+                    struct v3dv_pipeline *pipeline,
+                    struct v3dv_shader_variant *variant)
 {
-   return v3dv_write_uniforms_wg_offsets(cmd_buffer, p_stage, NULL);
+   return v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, variant, NULL);
 }
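
For reference, the bytes that make up one pipeline-cache entry after this change, in the
order written by v3dv_pipeline_shared_data_write_to_blob() and read back by
v3dv_pipeline_shared_data_create_from_blob() above, can be sketched as follows. This is
an illustrative sketch only, not part of the patch; the struct and field names below are
hypothetical, and the real code streams the fields through Mesa's struct blob /
struct blob_reader helpers rather than declaring a layout struct.

   /* Sketch (hypothetical names): serialized layout of one cache entry. */
   struct sketch_cache_entry {
      unsigned char sha1_key[20];
      struct v3dv_descriptor_map ubo_map;     /* raw copies of the four maps */
      struct v3dv_descriptor_map ssbo_map;
      struct v3dv_descriptor_map sampler_map;
      struct v3dv_descriptor_map texture_map;
      uint8_t variant_count;                  /* 3 for graphics (vs bin/vs/fs), 1 for compute */
      /* variant_count times: a uint8_t stage index, followed by the
       * shader_variant_write_to_blob() payload for that variant (stage,
       * prog_data size and bytes, the uniform list, assembly_offset and
       * qpu_insts_size); no per-variant assembly is stored here.
       */
      uint32_t total_assembly_size;
      /* total_assembly_size bytes copied from the shared assembly_bo follow,
       * so all variants of the entry can be re-uploaded into a single BO. */
   };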