v3dv/pipeline: try to get the shader variant directly from the cache

Until now we always did a two-step cache lookup, because we used the
NIR shaders to fill in the key used to look up the compiled shaders.
But since we were already generating the sha1 key from the original
SPIR-V shader (or its internal NIR representation), any info we
collected from the NIR is already implicit in the original shader, so
we can avoid using the NIR in most cases.

Because the v3d_key that is used to compile a shader is populated with
data coming directly from the NIR shader or produced during NIR
lowerings, we can't use it directly as part of the pipeline cache
entry. We could split it, but that would be confusing, so we add a new
struct, v3dv_pipeline_key, used specifically to search for the compiled
shaders in the pipeline cache. v3d_key is still used to compile the
shaders.

Since we use the same sha1 key for all compiled shaders in a pipeline,
we can also group all of them in the same cache entry, so we don't need
a lookup per stage. This also allows us to cache pipeline data shared
by all the stages (like the descriptor maps).
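
A toy sketch of the resulting lookup flow (a fixed-size array stands in
for Mesa's hash table; names are illustrative, not the real v3dv API):

#include <stddef.h>
#include <string.h>

#define NUM_STAGES  4   /* vertex, vertex bin, fragment, compute */
#define MAX_ENTRIES 64

struct shared_data_sketch {
   unsigned char sha1_key[20];
   void *variants[NUM_STAGES];   /* every compiled stage, in one entry */
   /* descriptor maps shared by all stages would also live here */
};

static struct shared_data_sketch *entries[MAX_ENTRIES];
static unsigned num_entries;

/* One lookup per pipeline: a hit returns all stage variants at once. */
static struct shared_data_sketch *
cache_search_sketch(const unsigned char sha1_key[20])
{
   for (unsigned i = 0; i < num_entries; i++) {
      if (memcmp(entries[i]->sha1_key, sha1_key, 20) == 0)
         return entries[i];
   }
   return NULL;   /* miss: compile all stages, then insert the entry */
}

static void
cache_insert_sketch(struct shared_data_sketch *entry)
{
   if (num_entries < MAX_ENTRIES)
      entries[num_entries++] = entry;
}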

While we are here, we also create a single BO to store the assembly
for all the pipeline stages.
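
And a self-contained sketch of the single-BO idea (plain malloc stands in
for a v3dv BO; this mirrors roughly what the new upload_assembly() below
does):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct variant_sketch {
   const uint8_t *qpu_insts;
   uint32_t qpu_insts_size;
   uint32_t assembly_offset;   /* filled in by pack_assembly_sketch() */
};

/* Concatenate every stage's assembly into one allocation and record the
 * per-stage offsets, so each stage's code address becomes bo + offset. */
static uint8_t *
pack_assembly_sketch(struct variant_sketch *variants, unsigned count)
{
   uint32_t total = 0;
   for (unsigned i = 0; i < count; i++)
      total += variants[i].qpu_insts_size;

   uint8_t *bo = malloc(total);
   if (!bo)
      return NULL;

   uint32_t offset = 0;
   for (unsigned i = 0; i < count; i++) {
      variants[i].assembly_offset = offset;
      memcpy(bo + offset, variants[i].qpu_insts, variants[i].qpu_insts_size);
      offset += variants[i].qpu_insts_size;
   }
   return bo;
}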

Finally, we remove the link to the variant from the pipeline stage
struct, to avoid the confusion of having two links to the same
data. This also means that we stop using the pipeline stage structures
after the pipeline is created, so we can free them.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9403>
Alejandro Piñeiro, 2021-02-27 01:05:54 +01:00, committed by Marge Bot
parent 6afb8a9fec
commit e354c52801
6 changed files with 819 additions and 391 deletions


@ -3080,7 +3080,9 @@ job_update_ez_state(struct v3dv_job *job,
*/
/* If the FS writes Z, then it may update against the chosen EZ direction */
if (pipeline->fs->current_variant->prog_data.fs->writes_z) {
struct v3dv_shader_variant *fs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
if (fs_variant->prog_data.fs->writes_z) {
job->ez_state = VC5_EZ_DISABLED;
return;
}
@ -3673,7 +3675,7 @@ emit_varyings_state(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct v3d_fs_prog_data *prog_data_fs =
pipeline->fs->current_variant->prog_data.fs;
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
const uint32_t num_flags =
ARRAY_SIZE(prog_data_fs->flat_shade_flags);
@ -3753,8 +3755,11 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
(pipeline->layout->shader_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
if (needs_fs_update) {
struct v3dv_shader_variant *fs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
cmd_buffer->state.uniforms.fs =
v3dv_write_uniforms(cmd_buffer, pipeline->fs);
v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
}
const bool needs_vs_update =
@ -3762,11 +3767,17 @@ update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
(pipeline->layout->shader_stages & VK_SHADER_STAGE_VERTEX_BIT);
if (needs_vs_update) {
struct v3dv_shader_variant *vs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
struct v3dv_shader_variant *vs_bin_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
cmd_buffer->state.uniforms.vs =
v3dv_write_uniforms(cmd_buffer, pipeline->vs);
v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
cmd_buffer->state.uniforms.vs_bin =
v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin);
v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
}
}
@ -3780,10 +3791,17 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = state->gfx.pipeline;
assert(pipeline);
struct v3d_vs_prog_data *prog_data_vs =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
struct v3d_vs_prog_data *prog_data_vs_bin =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
struct v3d_fs_prog_data *prog_data_fs =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
/* Update the cache dirty flag based on the shader progs data */
job->tmu_dirty_rcl |= pipeline->vs_bin->current_variant->prog_data.vs->base.tmu_dirty_rcl;
job->tmu_dirty_rcl |= pipeline->vs->current_variant->prog_data.vs->base.tmu_dirty_rcl;
job->tmu_dirty_rcl |= pipeline->fs->current_variant->prog_data.fs->base.tmu_dirty_rcl;
job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
/* See GFXH-930 workaround below */
uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
@ -3796,6 +3814,14 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
32);
v3dv_return_if_oom(cmd_buffer, NULL);
struct v3dv_shader_variant *vs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
struct v3dv_shader_variant *vs_bin_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
struct v3dv_shader_variant *fs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@ -3810,11 +3836,11 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.As;
shader.coordinate_shader_code_address =
v3dv_cl_address(pipeline->vs_bin->current_variant->assembly_bo, 0);
v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
shader.vertex_shader_code_address =
v3dv_cl_address(pipeline->vs->current_variant->assembly_bo, 0);
v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
shader.fragment_shader_code_address =
v3dv_cl_address(pipeline->fs->current_variant->assembly_bo, 0);
v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
@ -3825,12 +3851,6 @@ emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
}
/* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
struct v3d_vs_prog_data *prog_data_vs =
pipeline->vs->current_variant->prog_data.vs;
struct v3d_vs_prog_data *prog_data_vs_bin =
pipeline->vs_bin->current_variant->prog_data.vs;
bool cs_loaded_any = false;
const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
prog_data_vs_bin->uses_biid ||
@ -5122,7 +5142,8 @@ static void
cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
{
assert(cmd_buffer->state.compute.pipeline);
assert(cmd_buffer->state.compute.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
assert(cmd_buffer->state.compute.pipeline->active_stages ==
VK_SHADER_STAGE_COMPUTE_BIT);
uint32_t *dirty = &cmd_buffer->state.dirty;
*dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
@ -5198,7 +5219,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t *wg_size_out)
{
struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
assert(pipeline && pipeline->cs && pipeline->cs->current_variant);
assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
struct v3dv_shader_variant *cs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
sizeof(struct v3dv_job), 8,
@ -5222,7 +5245,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
const struct v3d_compute_prog_data *cpd =
pipeline->cs->current_variant->prog_data.cs;
cs_variant->prog_data.cs;
const uint32_t wgs_per_sg = 1; /* FIXME */
const uint32_t wg_size = cpd->local_size[0] *
@ -5230,7 +5253,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
cpd->local_size[2];
submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
if (wg_size_out)
*wg_size_out = wg_size;
@ -5240,20 +5263,20 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
(group_count_x * group_count_y * group_count_z) - 1;
assert(submit->cfg[4] != ~0);
assert(pipeline->cs->current_variant &&
pipeline->cs->current_variant->assembly_bo);
const struct v3dv_shader_variant *variant = pipeline->cs->current_variant;
submit->cfg[5] = variant->assembly_bo->offset;
assert(pipeline->shared_data->assembly_bo);
struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (variant->prog_data.base->single_seg)
if (cs_variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (variant->prog_data.base->threads == 4)
if (cs_variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
if (variant->prog_data.cs->shared_size > 0) {
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
v3dv_bo_alloc(cmd_buffer->device,
variant->prog_data.cs->shared_size * wgs_per_sg,
cs_variant->prog_data.cs->shared_size * wgs_per_sg,
"shared_vars", true);
if (!job->csd.shared_memory) {
v3dv_flag_oom(cmd_buffer, NULL);
@ -5261,10 +5284,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
}
}
v3dv_job_add_bo(job, variant->assembly_bo);
v3dv_job_add_bo(job, cs_assembly_bo);
struct v3dv_cl_reloc uniforms =
v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
cs_variant,
wg_uniform_offsets_out);
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;


@ -235,7 +235,8 @@ create_pipeline(struct v3dv_device *device,
struct vk_shader_module fs_m;
v3dv_shader_module_internal_init(device, &vs_m, vs_nir);
v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
if (fs_nir)
v3dv_shader_module_internal_init(device, &fs_m, fs_nir);
VkPipelineShaderStageCreateInfo stages[2] = {
{
@ -247,7 +248,7 @@ create_pipeline(struct v3dv_device *device,
{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_FRAGMENT_BIT,
.module = vk_shader_module_to_handle(&fs_m),
.module = fs_nir ? vk_shader_module_to_handle(&fs_m) : VK_NULL_HANDLE,
.pName = "main",
},
};


@ -83,8 +83,9 @@ void
v3dv_shader_variant_destroy(struct v3dv_device *device,
struct v3dv_shader_variant *variant)
{
if (variant->assembly_bo)
v3dv_bo_free(device, variant->assembly_bo);
/* The assembly BO is shared by all variants in the pipeline, so it can't
* be freed here and should be freed with the pipeline
*/
ralloc_free(variant->prog_data.base);
vk_free(&device->vk.alloc, variant);
}
@ -98,11 +99,30 @@ destroy_pipeline_stage(struct v3dv_device *device,
return;
ralloc_free(p_stage->nir);
if (p_stage->current_variant)
v3dv_shader_variant_unref(device, p_stage->current_variant);
vk_free2(&device->vk.alloc, pAllocator, p_stage);
}
static void
pipeline_free_stages(struct v3dv_device *device,
struct v3dv_pipeline *pipeline,
const VkAllocationCallbacks *pAllocator)
{
assert(pipeline);
/* FIXME: we can't just use a loop over the mesa stages due to the bin stage;
* it would be good to find an alternative.
*/
destroy_pipeline_stage(device, pipeline->vs, pAllocator);
destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator);
destroy_pipeline_stage(device, pipeline->fs, pAllocator);
destroy_pipeline_stage(device, pipeline->cs, pAllocator);
pipeline->vs = NULL;
pipeline->vs_bin = NULL;
pipeline->fs = NULL;
pipeline->cs = NULL;
}
static void
v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_device *device,
@ -111,13 +131,12 @@ v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline,
if (!pipeline)
return;
/* FIXME: we can't just use a loop over mesa stage due the bin, would be
* good to find an alternative.
*/
destroy_pipeline_stage(device, pipeline->vs, pAllocator);
destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator);
destroy_pipeline_stage(device, pipeline->fs, pAllocator);
destroy_pipeline_stage(device, pipeline->cs, pAllocator);
pipeline_free_stages(device, pipeline, pAllocator);
if (pipeline->shared_data) {
v3dv_pipeline_shared_data_unref(device, pipeline->shared_data);
pipeline->shared_data = NULL;
}
if (pipeline->spill.bo) {
assert(pipeline->spill.size_per_thread > 0);
@ -432,6 +451,7 @@ shader_module_compile_to_nir(struct v3dv_device *device,
broadcom_shader_stage_to_gl(stage->stage),
stage->entrypoint,
&spirv_options, nir_options);
assert(nir);
nir_validate_shader(nir, "after spirv_to_nir");
free(spec_entries);
} else {
@ -565,7 +585,7 @@ lower_vulkan_resource_index(nir_builder *b,
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
struct v3dv_descriptor_map *descriptor_map =
nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?
&pipeline->ubo_map : &pipeline->ssbo_map;
&pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map;
if (!const_val)
unreachable("non-constant vulkan_resource_index array index");
@ -680,9 +700,11 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
uint8_t return_size = relaxed_precision || instr->is_shadow ? 16 : 32;
struct v3dv_descriptor_map *map = is_sampler ?
&pipeline->shared_data->sampler_map :
&pipeline->shared_data->texture_map;
int desc_index =
descriptor_map_add(is_sampler ?
&pipeline->sampler_map : &pipeline->texture_map,
descriptor_map_add(map,
deref->var->data.descriptor_set,
deref->var->data.binding,
array_index,
@ -784,7 +806,7 @@ lower_image_deref(nir_builder *b,
binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER);
int desc_index =
descriptor_map_add(&pipeline->texture_map,
descriptor_map_add(&pipeline->shared_data->texture_map,
deref->var->data.descriptor_set,
deref->var->data.binding,
array_index,
@ -957,8 +979,10 @@ pipeline_populate_v3d_key(struct v3d_key *key,
/* The following values are default values used at pipeline create. We use
* there 32 bit as default return size.
*/
struct v3dv_descriptor_map *sampler_map = &p_stage->pipeline->sampler_map;
struct v3dv_descriptor_map *texture_map = &p_stage->pipeline->texture_map;
struct v3dv_descriptor_map *sampler_map =
&p_stage->pipeline->shared_data->sampler_map;
struct v3dv_descriptor_map *texture_map =
&p_stage->pipeline->shared_data->texture_map;
key->num_tex_used = texture_map->num_desc;
assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS);
@ -1171,7 +1195,8 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
key->num_used_outputs = 0;
} else {
struct v3dv_pipeline *pipeline = p_stage->pipeline;
struct v3dv_shader_variant *fs_variant = pipeline->fs->current_variant;
struct v3dv_shader_variant *fs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
key->num_used_outputs = fs_variant->prog_data.fs->num_inputs;
@ -1217,113 +1242,123 @@ pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src,
p_stage->stage = BROADCOM_SHADER_VERTEX_BIN;
p_stage->entrypoint = src->entrypoint;
p_stage->module = src->module;
p_stage->nir = nir_shader_clone(NULL, src->nir);
p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL;
p_stage->spec_info = src->spec_info;
memcpy(p_stage->shader_sha1, src->shader_sha1, 20);
return p_stage;
}
/* FIXME: right now this just asks for an bo for the exact size of the qpu
* assembly. It would be good to be able to re-use bos to avoid bo
* fragmentation. This could be tricky though, as right now we are uploading
* the assembly from two paths, when compiling a shader, or when deserializing
* from the pipeline cache. This also means that the same variant can be
* shared by different objects. So with the current approach it is clear who
* owns the assembly bo, but if shared, who owns the shared bo?
*
* For now one-bo per-assembly would work.
*
/**
* Returns false if it was not able to allocate or map the assembly bo memory.
*/
static bool
upload_assembly(struct v3dv_device *device,
struct v3dv_shader_variant *variant,
broadcom_shader_stage stage,
const void *data,
uint32_t size)
upload_assembly(struct v3dv_pipeline *pipeline)
{
const char *name = NULL;
/* We are uploading the assembly just once, so at this point we shouldn't
* have any bo
*/
assert(variant->assembly_bo == NULL);
uint32_t total_size = 0;
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
struct v3dv_shader_variant *variant =
pipeline->shared_data->variants[stage];
switch (stage) {
case BROADCOM_SHADER_VERTEX:
name = "vertex_shader_assembly";
break;
case BROADCOM_SHADER_VERTEX_BIN:
name = "vs_bin_shader_assembly";
break;
case BROADCOM_SHADER_FRAGMENT:
name = "fragment_shader_assembly";
break;
case BROADCOM_SHADER_COMPUTE:
name = "compute_shader_assembly";
break;
default:
unreachable("Stage not supported\n");
break;
};
if (variant != NULL)
total_size += variant->qpu_insts_size;
}
struct v3dv_bo *bo = v3dv_bo_alloc(device, size, name, true);
struct v3dv_bo *bo = v3dv_bo_alloc(pipeline->device, total_size,
"pipeline shader assembly", true);
if (!bo) {
fprintf(stderr, "failed to allocate memory for shader\n");
return false;
}
bool ok = v3dv_bo_map(device, bo, size);
bool ok = v3dv_bo_map(pipeline->device, bo, total_size);
if (!ok) {
fprintf(stderr, "failed to map source shader buffer\n");
return false;
}
memcpy(bo->map, data, size);
uint32_t offset = 0;
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
struct v3dv_shader_variant *variant =
pipeline->shared_data->variants[stage];
/* We don't unmap the assembly bo, as we will use it to gather the assembly
* when serializing the variant.
*/
variant->assembly_bo = bo;
if (variant != NULL) {
variant->assembly_offset = offset;
memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size);
offset += variant->qpu_insts_size;
/* We don't need qpu_insts anymore. */
free(variant->qpu_insts);
variant->qpu_insts = NULL;
}
}
assert(total_size == offset);
pipeline->shared_data->assembly_bo = bo;
return true;
}
static void
pipeline_hash_variant(const struct v3dv_pipeline_stage *p_stage,
struct v3d_key *key,
size_t key_size,
unsigned char *sha1_out)
pipeline_hash_graphics(const struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
unsigned char *sha1_out)
{
struct mesa_sha1 ctx;
struct v3dv_pipeline *pipeline = p_stage->pipeline;
_mesa_sha1_init(&ctx);
if (p_stage->stage == BROADCOM_SHADER_COMPUTE) {
_mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
} else {
/* We need to include both on the sha1 key as one could affect the other
* during linking (like if vertex output are constants, then the
* fragment shader would load_const intead of load_input). An
* alternative would be to use the serialized nir, but that seems like
* an overkill
*/
_mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
sizeof(pipeline->vs->shader_sha1));
_mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
sizeof(pipeline->fs->shader_sha1));
}
_mesa_sha1_update(&ctx, key, key_size);
/* We need to include both in the sha1 key, as one could affect the other
* during linking (e.g. if vertex outputs are constants, the fragment
* shader would load_const instead of load_input). An alternative would be
* to use the serialized nir, but that seems like overkill.
*/
_mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
sizeof(pipeline->vs->shader_sha1));
_mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
sizeof(pipeline->fs->shader_sha1));
_mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
_mesa_sha1_final(&ctx, sha1_out);
}
/* Checks that the pipeline has enough spill size to use a specific variant */
static void
pipeline_check_spill_size(struct v3dv_pipeline *pipeline,
struct v3dv_shader_variant *variant)
pipeline_hash_compute(const struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
unsigned char *sha1_out)
{
if (variant->prog_data.base->spill_size > pipeline->spill.size_per_thread) {
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
_mesa_sha1_update(&ctx, pipeline->cs->shader_sha1,
sizeof(pipeline->cs->shader_sha1));
_mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
_mesa_sha1_final(&ctx, sha1_out);
}
/* Checks that the pipeline has enough spill size to use for any of its
* variants
*/
static void
pipeline_check_spill_size(struct v3dv_pipeline *pipeline)
{
uint32_t max_spill_size = 0;
for(uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
struct v3dv_shader_variant *variant =
pipeline->shared_data->variants[stage];
if (variant != NULL) {
max_spill_size = MAX2(variant->prog_data.base->spill_size,
max_spill_size);
}
}
if (max_spill_size > 0) {
struct v3dv_device *device = pipeline->device;
/* The TIDX register we use for choosing the area to access
@ -1332,30 +1367,35 @@ pipeline_check_spill_size(struct v3dv_pipeline *pipeline,
* means we still multiply by qpus by 4.
*/
const uint32_t total_spill_size =
4 * device->devinfo.qpu_count * variant->prog_data.base->spill_size;
4 * device->devinfo.qpu_count * max_spill_size;
if (pipeline->spill.bo) {
assert(pipeline->spill.size_per_thread > 0);
v3dv_bo_free(device, pipeline->spill.bo);
}
pipeline->spill.bo =
v3dv_bo_alloc(device, total_spill_size, "spill", true);
pipeline->spill.size_per_thread = variant->prog_data.base->spill_size;
pipeline->spill.size_per_thread = max_spill_size;
}
}
/*
* Creates a new shader_variant_create. Note that for prog_data is const, so
* it is used only to copy to their own prog_data
/**
* Creates a new shader_variant. Note that prog_data is not const, so it is
* assumed that the caller is providing a pointer that the shader_variant
* will own.
*
* Creation includes allocating a shader source bo, and filling it up.
* Creation doesn't include allocating a BO to store the contents of
* qpu_insts, as we will try to share the same bo for several shader
* variants. Also note that qpu_insts being NULL is valid, for example if
* we are creating the shader_variants from the cache, so we can just
* upload the assembly of all the shader stages at once.
*/
struct v3dv_shader_variant *
v3dv_shader_variant_create(struct v3dv_device *device,
broadcom_shader_stage stage,
const unsigned char *variant_sha1,
struct v3d_prog_data *prog_data,
uint32_t prog_data_size,
const uint64_t *qpu_insts,
uint32_t assembly_offset,
uint64_t *qpu_insts,
uint32_t qpu_insts_size,
VkResult *out_vk_result)
{
@ -1368,70 +1408,35 @@ v3dv_shader_variant_create(struct v3dv_device *device,
return NULL;
}
variant->ref_cnt = 1;
variant->stage = stage;
memcpy(variant->variant_sha1, variant_sha1, sizeof(variant->variant_sha1));
variant->prog_data_size = prog_data_size;
variant->prog_data.base = prog_data;
if (qpu_insts) {
if (!upload_assembly(device, variant, stage,
qpu_insts, qpu_insts_size)) {
ralloc_free(variant->prog_data.base);
vk_free(&device->vk.alloc, variant);
*out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
return NULL;
}
variant->qpu_insts_size = qpu_insts_size;
}
variant->assembly_offset = assembly_offset;
variant->qpu_insts_size = qpu_insts_size;
variant->qpu_insts = qpu_insts;
*out_vk_result = VK_SUCCESS;
return variant;
}
/* For a given key, it returns the compiled version of the shader. If it was
* already compiled, it gets it from the p_stage cache, if not it compiles is
* through the v3d compiler
/* For a given key, it returns the compiled version of the shader.
*
* If the method returns NULL it means that it was not able to allocate the
* resources for the variant. out_vk_result would return which OOM applies.
* resources for the variant. out_vk_result would return the corresponding OOM
* error.
*
* Returns a new reference of the shader_variant to the caller.
* Returns a new reference to the shader_variant to the caller.
*/
struct v3dv_shader_variant*
v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
struct v3dv_pipeline_cache *cache,
struct v3d_key *key,
size_t key_size,
const VkAllocationCallbacks *pAllocator,
VkResult *out_vk_result)
static struct v3dv_shader_variant*
pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
struct v3d_key *key,
size_t key_size,
const VkAllocationCallbacks *pAllocator,
VkResult *out_vk_result)
{
/* We search on the pipeline cache if provided by the user, or the default
* one
*/
unsigned char variant_sha1[20];
pipeline_hash_variant(p_stage, key, key_size, variant_sha1);
struct v3dv_pipeline *pipeline = p_stage->pipeline;
struct v3dv_device *device = pipeline->device;
if (cache == NULL && device->instance->default_pipeline_cache_enabled)
cache = &device->default_pipeline_cache;
struct v3dv_shader_variant *variant =
v3dv_pipeline_cache_search_for_variant(pipeline,
cache,
variant_sha1);
if (variant) {
pipeline_check_spill_size(pipeline, variant);
*out_vk_result = VK_SUCCESS;
return variant;
}
/* If we don't find the variant in any cache, we compile one and add the
* variant to the cache
*/
struct v3dv_physical_device *physical_device =
&pipeline->device->instance->physicalDevice;
const struct v3d_compiler *compiler = physical_device->compiler;
@ -1448,6 +1453,8 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
uint64_t *qpu_insts;
uint32_t qpu_insts_size;
struct v3d_prog_data *prog_data;
uint32_t prog_data_size =
v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage));
qpu_insts = v3d_compile(compiler,
key, &prog_data,
@ -1462,30 +1469,17 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
p_stage->program_id);
}
variant = v3dv_shader_variant_create(device, p_stage->stage,
variant_sha1,
prog_data, v3d_prog_data_size(p_stage->stage),
qpu_insts, qpu_insts_size,
out_vk_result);
if (qpu_insts)
free(qpu_insts);
struct v3dv_shader_variant *variant =
v3dv_shader_variant_create(pipeline->device, p_stage->stage,
prog_data, prog_data_size,
0, /* assembly_offset, no final value yet */
qpu_insts, qpu_insts_size,
out_vk_result);
if (variant)
pipeline_check_spill_size(pipeline, variant);
if (*out_vk_result == VK_SUCCESS) {
struct v3dv_pipeline_cache *default_cache =
&pipeline->device->default_pipeline_cache;
v3dv_pipeline_cache_upload_variant(pipeline, cache, variant);
/* Ensure that the NIR shader is on the default cache, as cmd_buffer could
* need to change the current variant.
*/
if (default_cache != cache) {
v3dv_pipeline_cache_upload_variant(pipeline, default_cache, variant);
}
}
/* At this point we don't need anymore the nir shader, but we are freeing
* all the temporary p_stage structs used during the pipeline creation when
* we finish it, so let's not worry about freeing the nir here.
*/
return variant;
}
@ -1596,12 +1590,12 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline,
* another for the case we need a 32bit return size.
*/
UNUSED unsigned index =
descriptor_map_add(&pipeline->sampler_map,
descriptor_map_add(&pipeline->shared_data->sampler_map,
-1, -1, -1, 0, 16);
assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
index =
descriptor_map_add(&pipeline->sampler_map,
descriptor_map_add(&pipeline->shared_data->sampler_map,
-2, -2, -2, 0, 32);
assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
@ -1693,75 +1687,184 @@ pipeline_hash_shader(const struct vk_shader_module *module,
_mesa_sha1_final(&ctx, sha1_out);
}
static VkResult
pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator)
const VkAllocationCallbacks *pAllocator,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
struct v3dv_pipeline_stage *p_stage = pipeline->vs;
pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
/* Right now we only support pipelines with both vertex and fragment
* shader.
*/
assert(pipeline->fs);
assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
/* Make sure we do all our common lowering *before* we create the vs
* and vs_bin pipeline stages, since from that point forward we need to
* run lowerings for both of them separately, since each stage will
* own its NIR code.
*/
lower_vs_io(p_stage->nir);
pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, pAllocator);
if (pipeline->vs_bin == NULL)
return VK_ERROR_OUT_OF_HOST_MEMORY;
assert(pipeline->vs_bin != NULL);
if (pipeline->vs_bin->nir == NULL) {
assert(pipeline->vs->nir);
pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir);
}
VkResult vk_result;
struct v3d_vs_key key;
pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs);
VkResult vk_result;
pipeline->vs->current_variant =
v3dv_get_shader_variant(pipeline->vs, cache, &key.base, sizeof(key),
pAllocator, &vk_result);
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] =
pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key),
pAllocator, &vk_result);
if (vk_result != VK_SUCCESS)
return vk_result;
pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin);
pipeline->vs_bin->current_variant =
v3dv_get_shader_variant(pipeline->vs_bin, cache, &key.base, sizeof(key),
pAllocator, &vk_result);
p_stage = pipeline->vs_bin;
pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage);
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] =
pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key),
pAllocator, &vk_result);
return vk_result;
}
static VkResult
pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator)
const VkAllocationCallbacks *pAllocator,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
struct v3dv_pipeline_stage *p_stage = pipeline->vs;
p_stage = pipeline->fs;
pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
struct v3d_fs_key key;
pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage,
get_ucp_enable_mask(pipeline->vs));
lower_fs_io(p_stage->nir);
VkResult vk_result;
p_stage->current_variant =
v3dv_get_shader_variant(p_stage, cache, &key.base, sizeof(key),
pAllocator, &vk_result);
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] =
pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key),
pAllocator, &vk_result);
return vk_result;
}
static void
pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
memset(key, 0, sizeof(*key));
key->robust_buffer_access =
pipeline->device->features.robustBufferAccess;
const VkPipelineInputAssemblyStateCreateInfo *ia_info =
pCreateInfo->pInputAssemblyState;
key->topology = vk_to_pipe_prim_type[ia_info->topology];
const VkPipelineColorBlendStateCreateInfo *cb_info =
pCreateInfo->pColorBlendState;
key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
vk_to_pipe_logicop[cb_info->logicOp] :
PIPE_LOGICOP_COPY;
const bool raster_enabled =
!pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
/* Multisample rasterization state must be ignored if rasterization
* is disabled.
*/
const VkPipelineMultisampleStateCreateInfo *ms_info =
raster_enabled ? pCreateInfo->pMultisampleState : NULL;
if (ms_info) {
assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT ||
ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
if (key->msaa) {
key->sample_coverage =
pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1;
key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
key->sample_alpha_to_one = ms_info->alphaToOneEnable;
}
}
const struct v3dv_render_pass *pass =
v3dv_render_pass_from_handle(pCreateInfo->renderPass);
const struct v3dv_subpass *subpass = pipeline->subpass;
for (uint32_t i = 0; i < subpass->color_count; i++) {
const uint32_t att_idx = subpass->color_attachments[i].attachment;
if (att_idx == VK_ATTACHMENT_UNUSED)
continue;
key->cbufs |= 1 << i;
VkFormat fb_format = pass->attachments[att_idx].desc.format;
enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
/* If logic operations are enabled then we might emit color reads and we
* need to know the color buffer format and swizzle for that
*/
if (key->logicop_func != PIPE_LOGICOP_COPY) {
key->color_fmt[i].format = fb_pipe_format;
key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format);
}
const struct util_format_description *desc =
vk_format_description(fb_format);
if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
desc->channel[0].size == 32) {
key->f32_color_rb |= 1 << i;
}
}
const VkPipelineVertexInputStateCreateInfo *vi_info =
pCreateInfo->pVertexInputState;
for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
const VkVertexInputAttributeDescription *desc =
&vi_info->pVertexAttributeDescriptions[i];
assert(desc->location < MAX_VERTEX_ATTRIBS);
if (desc->format == VK_FORMAT_B8G8R8A8_UNORM)
key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
}
}
static void
pipeline_populate_compute_key(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_key *key,
const VkComputePipelineCreateInfo *pCreateInfo)
{
/* We use the same pipeline key for graphics and compute, but we don't need
* to add a field to flag compute keys because this key is not used alone
* to search in the cache, we also use the SPIR-V or the serialized NIR for
* example, which already flags compute shaders.
*/
memset(key, 0, sizeof(*key));
key->robust_buffer_access =
pipeline->device->features.robustBufferAccess;
}
static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20],
struct v3dv_device *device)
{
size_t size = sizeof(struct v3dv_pipeline_shared_data);
/* We create new_entry using the device alloc. Right now shared_data is ref
* and unref by both the pipeline and the pipeline cache, so we can't
* ensure that the cache or pipeline alloc will be available on the last
* unref.
*/
struct v3dv_pipeline_shared_data *new_entry =
vk_zalloc2(&device->vk.alloc, NULL, size, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_entry == NULL)
return NULL;
new_entry->ref_cnt = 1;
memcpy(new_entry->sha1_key, sha1_key, 20);
return new_entry;
}
/*
* It compiles a pipeline. Note that it also allocates internal objects, but
* if some allocations succeed and others fail, the method does not free the
@ -1782,8 +1885,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
struct v3dv_physical_device *physical_device =
&device->instance->physicalDevice;
/* First pass to get the the common info from the shader and the nir
* shader. We don't care of the coord shader for now.
/* First pass to get some common info from the shader, and create the
* individual pipeline_stage objects
*/
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
@ -1819,11 +1922,19 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
pipeline->active_stages |= sinfo->stage;
p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
/* We will try to get directly the compiled shader variant, so let's not
* worry about getting the nir shader for now.
*/
p_stage->nir = NULL;
switch(stage) {
case MESA_SHADER_VERTEX:
pipeline->vs = p_stage;
pipeline->vs_bin =
pipeline_stage_create_vs_bin(pipeline->vs, pAllocator);
if (pipeline->vs_bin == NULL)
return VK_ERROR_OUT_OF_HOST_MEMORY;
break;
case MESA_SHADER_FRAGMENT:
pipeline->fs = p_stage;
@ -1864,34 +1975,86 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
pipeline->active_stages |= MESA_SHADER_FRAGMENT;
}
/* Linking */
/* Now we will try to get the variants from the pipeline cache */
struct v3dv_pipeline_key pipeline_key;
pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo);
unsigned char pipeline_sha1[20];
pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1);
pipeline->shared_data =
v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1);
if (pipeline->shared_data != NULL) {
assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]);
assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
goto success;
}
pipeline->shared_data =
v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline->device);
/* If not, we try to get the nir shaders (from the SPIR-V shader, or from
* the pipeline cache again) and compile.
*/
if (!pipeline->vs->nir)
pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache);
if (!pipeline->fs->nir)
pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache);
/* Linking + pipeline lowerings */
link_shaders(pipeline->vs->nir, pipeline->fs->nir);
/* Compiling to vir (or getting it from a cache);
*/
pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout);
lower_fs_io(pipeline->fs->nir);
pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout);
lower_vs_io(pipeline->vs->nir);
/* Compiling to vir */
VkResult vk_result;
vk_result = pipeline_compile_fragment_shader(pipeline, cache,
pCreateInfo, pAllocator);
/* We should have got all the variants or no variants from the cache */
assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo);
if (vk_result != VK_SUCCESS)
return vk_result;
vk_result = pipeline_compile_vertex_shader(pipeline, cache,
pCreateInfo, pAllocator);
assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] &&
!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
vk_result = pipeline_compile_vertex_shader(pipeline, pAllocator, pCreateInfo);
if (vk_result != VK_SUCCESS)
return vk_result;
if (!upload_assembly(pipeline))
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
/* As we got the variants in pipeline->shared_data, after compiling we
* don't need the pipeline_stages
*/
pipeline_free_stages(device, pipeline, pAllocator);
success:
pipeline_check_spill_size(pipeline);
/* FIXME: values below are defaults when no GS is available. Would need to
* provide real values if GS gets supported
*/
struct v3dv_shader_variant *vs_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
struct v3dv_shader_variant *vs_bin_variant =
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
pipeline->vpm_cfg_bin.As = 1;
pipeline->vpm_cfg_bin.Ve = 0;
pipeline->vpm_cfg_bin.Vc =
pipeline->vs_bin->current_variant->prog_data.vs->vcm_cache_size;
pipeline->vpm_cfg_bin.Vc = vs_bin_variant->prog_data.vs->vcm_cache_size;
pipeline->vpm_cfg.As = 1;
pipeline->vpm_cfg.Ve = 0;
pipeline->vpm_cfg.Vc =
pipeline->vs->current_variant->prog_data.vs->vcm_cache_size;
pipeline->vpm_cfg.Vc = vs_variant->prog_data.vs->vcm_cache_size;
return VK_SUCCESS;
}
@ -2397,13 +2560,13 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
pipeline->fs->current_variant->prog_data.fs;
pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
struct v3d_vs_prog_data *prog_data_vs =
pipeline->vs->current_variant->prog_data.vs;
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
struct v3d_vs_prog_data *prog_data_vs_bin =
pipeline->vs_bin->current_variant->prog_data.vs;
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
/* Note: we are not packing addresses, as we need the job (see
@ -2787,7 +2950,7 @@ pipeline_init(struct v3dv_pipeline *pipeline,
pipeline->va_count = 0;
struct v3d_vs_prog_data *prog_data_vs =
pipeline->vs->current_variant->prog_data.vs;
pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs;
for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
const VkVertexInputAttributeDescription *desc =
@ -2835,7 +2998,7 @@ graphics_pipeline_create(VkDevice _device,
/* Use the default pipeline cache if none is specified */
if (cache == NULL && device->instance->default_pipeline_cache_enabled)
cache = &device->default_pipeline_cache;
cache = &device->default_pipeline_cache;
pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
VK_OBJECT_TYPE_PIPELINE);
@ -2945,24 +3108,62 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline,
p_stage->spec_info,
p_stage->shader_sha1);
p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
/* We try to get directly the variant first from the cache */
p_stage->nir = NULL;
pipeline->cs = p_stage;
pipeline->active_stages |= sinfo->stage;
struct v3dv_pipeline_key pipeline_key;
pipeline_populate_compute_key(pipeline, &pipeline_key, info);
unsigned char pipeline_sha1[20];
pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1);
pipeline->shared_data =
v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1);
if (pipeline->shared_data != NULL) {
assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
goto success;
}
pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1,
pipeline->device);
/* If not found on cache, compile it */
p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
assert(p_stage->nir);
st_nir_opts(p_stage->nir);
pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
lower_cs_shared(p_stage->nir);
pipeline->cs = p_stage;
VkResult result = VK_SUCCESS;
struct v3d_key key;
memset(&key, 0, sizeof(key));
pipeline_populate_v3d_key(&key, p_stage, 0,
pipeline->device->features.robustBufferAccess);
pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] =
pipeline_compile_shader_variant(p_stage, &key, sizeof(key),
alloc, &result);
VkResult result;
p_stage->current_variant =
v3dv_get_shader_variant(p_stage, cache, &key, sizeof(key), alloc, &result);
return result;
if (result != VK_SUCCESS)
return result;
if (!upload_assembly(pipeline))
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
/* As we got the variants in pipeline->shared_data, after compiling we
* don't need the pipeline_stages
*/
pipeline_free_stages(device, pipeline, alloc);
success:
pipeline_check_spill_size(pipeline);
return VK_SUCCESS;
}
static VkResult
@ -2997,7 +3198,7 @@ compute_pipeline_create(VkDevice _device,
/* Use the default pipeline cache if none is specified */
if (cache == NULL && device->instance->default_pipeline_cache_enabled)
cache = &device->default_pipeline_cache;
cache = &device->default_pipeline_cache;
pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
VK_OBJECT_TYPE_PIPELINE);


@ -58,9 +58,9 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache)
fprintf(stderr, " NIR cache miss count: %d\n", cache->nir_stats.miss);
fprintf(stderr, " NIR cache hit count: %d\n", cache->nir_stats.hit);
fprintf(stderr, " variant cache entries: %d\n", cache->variant_stats.count);
fprintf(stderr, " variant cache miss count: %d\n", cache->variant_stats.miss);
fprintf(stderr, " variant cache hit count: %d\n", cache->variant_stats.hit);
fprintf(stderr, " cache entries: %d\n", cache->stats.count);
fprintf(stderr, " cache miss count: %d\n", cache->stats.miss);
fprintf(stderr, " cache hit count: %d\n", cache->stats.hit);
}
void
@ -197,59 +197,65 @@ v3dv_pipeline_cache_init(struct v3dv_pipeline_cache *cache,
cache->nir_stats.hit = 0;
cache->nir_stats.count = 0;
cache->variant_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
sha1_compare_func);
cache->variant_stats.miss = 0;
cache->variant_stats.hit = 0;
cache->variant_stats.count = 0;
cache->cache = _mesa_hash_table_create(NULL, sha1_hash_func,
sha1_compare_func);
cache->stats.miss = 0;
cache->stats.hit = 0;
cache->stats.count = 0;
} else {
cache->nir_cache = NULL;
cache->variant_cache = NULL;
cache->cache = NULL;
}
}
struct v3dv_shader_variant*
v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
unsigned char sha1_key[20])
/**
* Searches the cache for pipeline data and returns a v3dv_pipeline_shared_data
* with it, or NULL if it is not cached. On a hit it increases the ref_count,
* so the caller is responsible for unreffing it.
*/
struct v3dv_pipeline_shared_data *
v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
unsigned char sha1_key[20])
{
if (!cache || !cache->variant_cache)
if (!cache || !cache->cache)
return NULL;
if (debug_cache) {
char sha1buf[41];
_mesa_sha1_format(sha1buf, sha1_key);
fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf);
fprintf(stderr, "pipeline cache %p, search pipeline with key %s\n", cache, sha1buf);
}
pthread_mutex_lock(&cache->mutex);
struct hash_entry *entry =
_mesa_hash_table_search(cache->variant_cache, sha1_key);
_mesa_hash_table_search(cache->cache, sha1_key);
if (entry) {
struct v3dv_shader_variant *variant =
(struct v3dv_shader_variant *) entry->data;
struct v3dv_pipeline_shared_data *cache_entry =
(struct v3dv_pipeline_shared_data *) entry->data;
assert(cache_entry);
cache->variant_stats.hit++;
cache->stats.hit++;
if (debug_cache) {
fprintf(stderr, "\tvariant cache hit: %p\n", variant);
fprintf(stderr, "\tcache hit: %p\n", cache_entry);
if (dump_stats)
cache_dump_stats(cache);
}
if (variant)
v3dv_shader_variant_ref(variant);
v3dv_pipeline_shared_data_ref(cache_entry);
pthread_mutex_unlock(&cache->mutex);
return variant;
return cache_entry;
}
cache->variant_stats.miss++;
cache->stats.miss++;
if (debug_cache) {
fprintf(stderr, "\tvariant cache miss\n");
fprintf(stderr, "\tcache miss\n");
if (dump_stats)
cache_dump_stats(cache);
}
@ -259,34 +265,109 @@ v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
}
void
v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
struct v3dv_shader_variant *variant)
v3dv_pipeline_shared_data_destroy(struct v3dv_device *device,
struct v3dv_pipeline_shared_data *shared_data)
{
if (!cache || !cache->variant_cache)
assert(shared_data->ref_cnt == 0);
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
if (shared_data->variants[stage] != NULL)
v3dv_shader_variant_destroy(device, shared_data->variants[stage]);
}
if (shared_data->assembly_bo)
v3dv_bo_free(device, shared_data->assembly_bo);
vk_free(&device->vk.alloc, shared_data);
}
static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_new(struct v3dv_pipeline_cache *cache,
const unsigned char sha1_key[20],
struct v3dv_shader_variant **variants,
const struct v3dv_descriptor_map *ubo_map,
const struct v3dv_descriptor_map *ssbo_map,
const struct v3dv_descriptor_map *sampler_map,
const struct v3dv_descriptor_map *texture_map,
const uint64_t *total_assembly,
const uint32_t total_assembly_size)
{
size_t size = sizeof(struct v3dv_pipeline_shared_data);
/* We create new_entry using the device alloc. Right now shared_data is ref
* and unref by both the pipeline and the pipeline cache, so we can't
* ensure that the cache or pipeline alloc will be available on the last
* unref.
*/
struct v3dv_pipeline_shared_data *new_entry =
vk_zalloc2(&cache->device->vk.alloc, NULL, size, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_entry == NULL)
return NULL;
new_entry->ref_cnt = 1;
memcpy(new_entry->sha1_key, sha1_key, 20);
memcpy(&new_entry->ubo_map, ubo_map, sizeof(struct v3dv_descriptor_map));
memcpy(&new_entry->ssbo_map, ssbo_map, sizeof(struct v3dv_descriptor_map));
memcpy(&new_entry->sampler_map, sampler_map, sizeof(struct v3dv_descriptor_map));
memcpy(&new_entry->texture_map, texture_map, sizeof(struct v3dv_descriptor_map));
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++)
new_entry->variants[stage] = variants[stage];
struct v3dv_bo *bo = v3dv_bo_alloc(cache->device, total_assembly_size,
"pipeline shader assembly", true);
if (!bo) {
fprintf(stderr, "failed to allocate memory for shaders assembly\n");
v3dv_pipeline_shared_data_unref(cache->device, new_entry);
return NULL;
}
bool ok = v3dv_bo_map(cache->device, bo, total_assembly_size);
if (!ok) {
fprintf(stderr, "failed to map source shader buffer\n");
v3dv_pipeline_shared_data_unref(cache->device, new_entry);
return NULL;
}
memcpy(bo->map, total_assembly, total_assembly_size);
new_entry->assembly_bo = bo;
return new_entry;
}
/* Uploads all the "cacheable" or shared data from the pipeline */
void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache)
{
if (!cache || !cache->cache)
return;
if (cache->variant_stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
if (cache->stats.count > V3DV_MAX_PIPELINE_CACHE_ENTRIES)
return;
pthread_mutex_lock(&cache->mutex);
struct hash_entry *entry =
_mesa_hash_table_search(cache->variant_cache, variant->variant_sha1);
_mesa_hash_table_search(cache->cache, pipeline->shared_data->sha1_key);
if (entry) {
pthread_mutex_unlock(&cache->mutex);
return;
}
v3dv_shader_variant_ref(variant);
_mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
cache->variant_stats.count++;
v3dv_pipeline_shared_data_ref(pipeline->shared_data);
_mesa_hash_table_insert(cache->cache, pipeline->shared_data->sha1_key,
pipeline->shared_data);
cache->stats.count++;
if (debug_cache) {
char sha1buf[41];
_mesa_sha1_format(sha1buf, variant->variant_sha1);
_mesa_sha1_format(sha1buf, pipeline->shared_data->sha1_key);
fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n",
cache, sha1buf, variant);
fprintf(stderr, "pipeline cache %p, new cache entry with sha1 key %s:%p\n\n",
cache, sha1buf, pipeline->shared_data);
if (dump_stats)
cache_dump_stats(cache);
}
@ -321,8 +402,6 @@ shader_variant_create_from_blob(struct v3dv_device *device,
broadcom_shader_stage stage = blob_read_uint32(blob);
const unsigned char *variant_sha1 = blob_read_bytes(blob, 20);
uint32_t prog_data_size = blob_read_uint32(blob);
/* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
assert(prog_data_size == v3d_prog_data_size(broadcom_shader_stage_to_gl(stage)));
@ -342,10 +421,8 @@ shader_variant_create_from_blob(struct v3dv_device *device,
if (blob->overrun)
return NULL;
uint32_t assembly_offset = blob_read_uint32(blob);
uint32_t qpu_insts_size = blob_read_uint32(blob);
const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size);
if (blob->overrun)
return NULL;
/* shader_variant_create expects a newly created prog_data for their own,
* as it is what the v3d compiler returns. So we are also allocating one
@ -362,12 +439,53 @@ shader_variant_create_from_blob(struct v3dv_device *device,
memcpy(ulist->data, ulist_data_data, ulist_data_size);
return v3dv_shader_variant_create(device, stage,
variant_sha1,
new_prog_data, prog_data_size,
qpu_insts, qpu_insts_size,
assembly_offset,
NULL, qpu_insts_size,
&result);
}
static struct v3dv_pipeline_shared_data *
v3dv_pipeline_shared_data_create_from_blob(struct v3dv_pipeline_cache *cache,
struct blob_reader *blob)
{
const unsigned char *sha1_key = blob_read_bytes(blob, 20);
const struct v3dv_descriptor_map *ubo_map =
blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map));
const struct v3dv_descriptor_map *ssbo_map =
blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map));
const struct v3dv_descriptor_map *sampler_map =
blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map));
const struct v3dv_descriptor_map *texture_map =
blob_read_bytes(blob, sizeof(struct v3dv_descriptor_map));
if (blob->overrun)
return NULL;
uint8_t variant_count = blob_read_uint8(blob);
struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES] = { 0 };
for (uint8_t count = 0; count < variant_count; count++) {
uint8_t stage = blob_read_uint8(blob);
struct v3dv_shader_variant *variant =
shader_variant_create_from_blob(cache->device, blob);
variants[stage] = variant;
}
uint32_t total_assembly_size = blob_read_uint32(blob);
const uint64_t *total_assembly =
blob_read_bytes(blob, total_assembly_size);
if (blob->overrun)
return NULL;
return v3dv_pipeline_shared_data_new(cache, sha1_key, variants,
ubo_map, ssbo_map, sampler_map, texture_map,
total_assembly, total_assembly_size);
}
static void
pipeline_cache_load(struct v3dv_pipeline_cache *cache,
size_t size,
@ -377,7 +495,7 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache,
struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
struct vk_pipeline_cache_header header;
if (cache->variant_cache == NULL)
if (cache->cache == NULL || cache->nir_cache == NULL)
return;
struct blob_reader blob;
@ -418,17 +536,18 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache,
return;
for (uint32_t i = 0; i < count; i++) {
struct v3dv_shader_variant *variant =
shader_variant_create_from_blob(device, &blob);
if (!variant)
struct v3dv_pipeline_shared_data *cache_entry =
v3dv_pipeline_shared_data_create_from_blob(cache, &blob);
if (!cache_entry)
break;
_mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
cache->variant_stats.count++;
_mesa_hash_table_insert(cache->cache, cache_entry->sha1_key, cache_entry);
cache->stats.count++;
}
if (debug_cache) {
fprintf(stderr, "pipeline cache %p, loaded %i nir shaders and "
"%i variant entries\n", cache, nir_count, count);
"%i entries\n", cache, nir_count, count);
if (dump_stats)
cache_dump_stats(cache);
}
@ -482,15 +601,14 @@ v3dv_pipeline_cache_finish(struct v3dv_pipeline_cache *cache)
_mesa_hash_table_destroy(cache->nir_cache, NULL);
}
if (cache->variant_cache) {
hash_table_foreach(cache->variant_cache, entry) {
struct v3dv_shader_variant *variant = entry->data;
if (variant)
v3dv_shader_variant_unref(cache->device, variant);
if (cache->cache) {
hash_table_foreach(cache->cache, entry) {
struct v3dv_pipeline_shared_data *cache_entry = entry->data;
if (cache_entry)
v3dv_pipeline_shared_data_unref(cache->device, cache_entry);
}
_mesa_hash_table_destroy(cache->variant_cache, NULL);
_mesa_hash_table_destroy(cache->cache, NULL);
}
}
@ -518,12 +636,12 @@ v3dv_MergePipelineCaches(VkDevice device,
{
V3DV_FROM_HANDLE(v3dv_pipeline_cache, dst, dstCache);
if (!dst->variant_cache || !dst->nir_cache)
if (!dst->cache || !dst->nir_cache)
return VK_SUCCESS;
for (uint32_t i = 0; i < srcCacheCount; i++) {
V3DV_FROM_HANDLE(v3dv_pipeline_cache, src, pSrcCaches[i]);
if (!src->variant_cache || !src->nir_cache)
if (!src->cache || !src->nir_cache)
continue;
hash_table_foreach(src->nir_cache, entry) {
@ -559,22 +677,22 @@ v3dv_MergePipelineCaches(VkDevice device,
}
}
hash_table_foreach(src->variant_cache, entry) {
struct v3dv_shader_variant *variant = entry->data;
assert(variant);
hash_table_foreach(src->cache, entry) {
struct v3dv_pipeline_shared_data *cache_entry = entry->data;
assert(cache_entry);
if (_mesa_hash_table_search(dst->variant_cache, variant->variant_sha1))
if (_mesa_hash_table_search(dst->cache, cache_entry->sha1_key))
continue;
v3dv_shader_variant_ref(variant);
_mesa_hash_table_insert(dst->variant_cache, variant->variant_sha1, variant);
v3dv_pipeline_shared_data_ref(cache_entry);
_mesa_hash_table_insert(dst->cache, cache_entry->sha1_key, cache_entry);
dst->variant_stats.count++;
dst->stats.count++;
if (debug_cache) {
char sha1buf[41];
_mesa_sha1_format(sha1buf, variant->variant_sha1);
_mesa_sha1_format(sha1buf, cache_entry->sha1_key);
fprintf(stderr, "pipeline cache %p, added variant entry %s "
fprintf(stderr, "pipeline cache %p, added entry %s "
"from pipeline cache %p\n",
dst, sha1buf, src);
if (dump_stats)
@ -592,8 +710,6 @@ shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
{
blob_write_uint32(blob, variant->stage);
blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));
blob_write_uint32(blob, variant->prog_data_size);
blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);
@ -602,13 +718,62 @@ shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
blob_write_bytes(blob, ulist->contents, sizeof(enum quniform_contents) * ulist->count);
blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count);
blob_write_uint32(blob, variant->assembly_offset);
blob_write_uint32(blob, variant->qpu_insts_size);
assert(variant->assembly_bo->map);
blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);
return !blob->out_of_memory;
}
static bool
v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data *cache_entry,
struct blob *blob)
{
blob_write_bytes(blob, cache_entry->sha1_key, 20);
blob_write_bytes(blob, &cache_entry->ubo_map,
sizeof(struct v3dv_descriptor_map));
blob_write_bytes(blob, &cache_entry->ssbo_map,
sizeof(struct v3dv_descriptor_map));
blob_write_bytes(blob, &cache_entry->sampler_map,
sizeof(struct v3dv_descriptor_map));
blob_write_bytes(blob, &cache_entry->texture_map,
sizeof(struct v3dv_descriptor_map));
uint8_t variant_count = 0;
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
if (cache_entry->variants[stage] == NULL)
continue;
variant_count++;
}
/* Right now we only support compute pipeline, or graphics pipeline with
* vertex, vertex bin, and fragment shader.
*/
assert(variant_count == 3 ||
(variant_count == 1 && cache_entry->variants[BROADCOM_SHADER_COMPUTE]));
blob_write_uint8(blob, variant_count);
uint32_t total_assembly_size = 0;
for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
if (cache_entry->variants[stage] == NULL)
continue;
blob_write_uint8(blob, stage);
if (!shader_variant_write_to_blob(cache_entry->variants[stage], blob))
return false;
total_assembly_size += cache_entry->variants[stage]->qpu_insts_size;
}
blob_write_uint32(blob, total_assembly_size);
assert(cache_entry->assembly_bo->map);
assert(cache_entry->assembly_bo->size > total_assembly_size);
blob_write_bytes(blob, cache_entry->assembly_bo->map, total_assembly_size);
return !blob->out_of_memory;
}
VkResult
v3dv_GetPipelineCacheData(VkDevice _device,
VkPipelineCache _cache,
@ -679,12 +844,12 @@ v3dv_GetPipelineCacheData(VkDevice _device,
return VK_INCOMPLETE;
}
if (cache->variant_cache) {
hash_table_foreach(cache->variant_cache, entry) {
struct v3dv_shader_variant *variant = entry->data;
if (cache->cache) {
hash_table_foreach(cache->cache, entry) {
struct v3dv_pipeline_shared_data *cache_entry = entry->data;
size_t save_size = blob.size;
if (!shader_variant_write_to_blob(variant, &blob)) {
if (!v3dv_pipeline_shared_data_write_to_blob(cache_entry, &blob)) {
/* If it fails, reset to the previous size and bail */
blob.size = save_size;
pthread_mutex_unlock(&cache->mutex);
@ -703,10 +868,10 @@ v3dv_GetPipelineCacheData(VkDevice _device,
blob_finish(&blob);
if (debug_cache) {
assert(count <= cache->variant_stats.count);
assert(count <= cache->stats.count);
fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
"%i nir shader entries "
"%i variant entries, %u DataSize\n",
"%i entries, %u DataSize\n",
cache, nir_count, count, (uint32_t) *pDataSize);
}

View file

@ -254,6 +254,23 @@ struct v3dv_meta_texel_buffer_copy_pipeline {
uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
};
struct v3dv_pipeline_key {
bool robust_buffer_access;
uint8_t topology;
uint8_t logicop_func;
bool msaa;
bool sample_coverage;
bool sample_alpha_to_coverage;
bool sample_alpha_to_one;
uint8_t cbufs;
struct {
enum pipe_format format;
const uint8_t *swizzle;
} color_fmt[V3D_MAX_DRAW_BUFFERS];
uint8_t f32_color_rb;
uint32_t va_swap_rb_mask;
};
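The cache itself is not indexed by this struct directly but by a 20-byte SHA-1 (the sha1_key stored in the cache entries introduced below). As a rough, hedged sketch, and not code from this change, such a key could be derived with Mesa's util/mesa-sha1.h helpers; the function name and the exact set of inputs hashed here are illustrative assumptions.

#include "util/mesa-sha1.h"

/* Illustrative sketch only: hash the pipeline state key together with each
 * stage's shader SHA-1 to produce a 20-byte cache key. The key struct must
 * be fully zero-initialized so padding bytes hash deterministically. */
static void
example_hash_pipeline_key(unsigned char sha1_out[20],
                          const struct v3dv_pipeline_key *key,
                          const unsigned char (*stage_sha1s)[20],
                          uint32_t num_stages)
{
   struct mesa_sha1 ctx;
   _mesa_sha1_init(&ctx);
   _mesa_sha1_update(&ctx, key, sizeof(*key));
   for (uint32_t i = 0; i < num_stages; i++)
      _mesa_sha1_update(&ctx, stage_sha1s[i], 20);
   _mesa_sha1_final(&ctx, sha1_out);
}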
struct v3dv_pipeline_cache_stats {
uint32_t miss;
uint32_t hit;
@ -314,8 +331,8 @@ struct v3dv_pipeline_cache {
struct hash_table *nir_cache;
struct v3dv_pipeline_cache_stats nir_stats;
struct hash_table *variant_cache;
struct v3dv_pipeline_cache_stats variant_stats;
struct hash_table *cache;
struct v3dv_pipeline_cache_stats stats;
};
struct v3dv_device {
@ -1340,15 +1357,8 @@ vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage)
}
struct v3dv_shader_variant {
uint32_t ref_cnt;
broadcom_shader_stage stage;
/* key for the pipeline cache, it is p_stage shader_sha1 + v3d compiler
* sha1
*/
unsigned char variant_sha1[20];
union {
struct v3d_prog_data *base;
struct v3d_vs_prog_data *vs;
@ -1360,11 +1370,17 @@ struct v3dv_shader_variant {
* serialize
*/
uint32_t prog_data_size;
/* FIXME: using one bo per shader. Eventually we would be interested on
* reusing the same bo for all the shaders, like a bo per v3dv_pipeline for
* shaders.
/* The assembly for this variant will be uploaded to a BO shared with all
* other shader stages in that pipeline. This is the offset in that BO.
*/
struct v3dv_bo *assembly_bo;
uint32_t assembly_offset;
/* Note: qpu_insts is very likely to be NULL, since it is only needed
 * temporarily to upload the assembly to the shared BO, because we compile
 * the different stages individually.
 */
uint64_t *qpu_insts;
uint32_t qpu_insts_size;
};
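For reference, a minimal hedged sketch of how a stage's code address could be formed when packing the shader state record: the shared BO plus this variant's offset. It assumes the existing v3dv_cl_address() helper; the actual packing code and packet field names are not shown here.

/* Hedged sketch: the GPU address of a variant's code is the shared assembly
 * BO plus assembly_offset. Assumes the v3dv_cl_address() helper. */
static inline struct v3dv_cl_reloc
example_variant_code_address(struct v3dv_bo *assembly_bo,
                             const struct v3dv_shader_variant *variant)
{
   return v3dv_cl_address(assembly_bo, variant->assembly_offset);
}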
@ -1393,8 +1409,6 @@ struct v3dv_pipeline_stage {
/** A name for this program, so you can track it in shader-db output. */
uint32_t program_id;
struct v3dv_shader_variant *current_variant;
};
/* FIXME: although the full vpm_config is not required at this point, as we
@ -1606,6 +1620,25 @@ v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key,
*sampler_index = sampler;
}
/* This structure represents data shared between different objects, like the
 * pipeline and the pipeline cache, so we ref-count it to know when it should
 * be freed.
 */
struct v3dv_pipeline_shared_data {
uint32_t ref_cnt;
unsigned char sha1_key[20];
struct v3dv_descriptor_map ubo_map;
struct v3dv_descriptor_map ssbo_map;
struct v3dv_descriptor_map sampler_map;
struct v3dv_descriptor_map texture_map;
struct v3dv_shader_variant *variants[BROADCOM_SHADER_STAGES];
struct v3dv_bo *assembly_bo;
};
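To make the relationship between the shared assembly_bo, each variant's assembly_offset, and the temporary qpu_insts concrete, here is a hedged sketch of how the stages' code could be packed contiguously into a single BO once all stages are compiled. It ignores any alignment the real upload path may apply, omits includes (string.h for memcpy), and the v3dv_bo_alloc()/v3dv_bo_map() signatures are assumptions based on the existing BO helpers.

/* Hedged sketch, not the driver's actual upload code. */
static bool
example_upload_assembly(struct v3dv_device *device,
                        struct v3dv_pipeline_shared_data *data)
{
   /* First pass: assign each variant its offset inside the shared BO. */
   uint32_t total_size = 0;
   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      struct v3dv_shader_variant *variant = data->variants[stage];
      if (!variant)
         continue;
      variant->assembly_offset = total_size;
      total_size += variant->qpu_insts_size;
   }

   data->assembly_bo =
      v3dv_bo_alloc(device, total_size, "pipeline shader assembly", true);
   if (!data->assembly_bo || !v3dv_bo_map(device, data->assembly_bo, total_size))
      return false;

   /* Second pass: copy each stage's temporary qpu_insts into place. After
    * this the qpu_insts arrays can be freed, which is why they are normally
    * NULL once the pipeline is in use.
    */
   for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
      struct v3dv_shader_variant *variant = data->variants[stage];
      if (!variant)
         continue;
      memcpy((uint8_t *)data->assembly_bo->map + variant->assembly_offset,
             variant->qpu_insts, variant->qpu_insts_size);
   }
   return true;
}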
struct v3dv_pipeline {
struct vk_object_base base;
@ -1668,11 +1701,7 @@ struct v3dv_pipeline {
enum pipe_prim_type topology;
struct v3dv_descriptor_map ubo_map;
struct v3dv_descriptor_map ssbo_map;
struct v3dv_descriptor_map sampler_map;
struct v3dv_descriptor_map texture_map;
struct v3dv_pipeline_shared_data *shared_data;
/* FIXME: this BO is another candidate for data to be uploaded using a
 * resource manager, instead of an individual BO
@ -1848,9 +1877,12 @@ void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
const struct pipe_box *box);
struct v3dv_cl_reloc v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage);
struct v3dv_pipeline *pipeline,
struct v3dv_shader_variant *variant);
struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage,
struct v3dv_pipeline *pipeline,
struct v3dv_shader_variant *variant,
uint32_t **wg_count_offsets);
struct v3dv_shader_variant *
@ -1864,10 +1896,10 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
struct v3dv_shader_variant *
v3dv_shader_variant_create(struct v3dv_device *device,
broadcom_shader_stage stage,
const unsigned char *variant_sha1,
struct v3d_prog_data *prog_data,
uint32_t prog_data_size,
const uint64_t *qpu_insts,
uint32_t assembly_offset,
uint64_t *qpu_insts,
uint32_t qpu_insts_size,
VkResult *out_vk_result);
@ -1876,19 +1908,23 @@ v3dv_shader_variant_destroy(struct v3dv_device *device,
struct v3dv_shader_variant *variant);
static inline void
v3dv_shader_variant_ref(struct v3dv_shader_variant *variant)
v3dv_pipeline_shared_data_ref(struct v3dv_pipeline_shared_data *shared_data)
{
assert(variant && variant->ref_cnt >= 1);
p_atomic_inc(&variant->ref_cnt);
assert(shared_data && shared_data->ref_cnt >= 1);
p_atomic_inc(&shared_data->ref_cnt);
}
void
v3dv_pipeline_shared_data_destroy(struct v3dv_device *device,
struct v3dv_pipeline_shared_data *shared_data);
static inline void
v3dv_shader_variant_unref(struct v3dv_device *device,
struct v3dv_shader_variant *variant)
v3dv_pipeline_shared_data_unref(struct v3dv_device *device,
struct v3dv_pipeline_shared_data *shared_data)
{
assert(variant && variant->ref_cnt >= 1);
if (p_atomic_dec_zero(&variant->ref_cnt))
v3dv_shader_variant_destroy(device, variant);
assert(shared_data && shared_data->ref_cnt >= 1);
if (p_atomic_dec_zero(&shared_data->ref_cnt))
v3dv_pipeline_shared_data_destroy(device, shared_data);
}
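As a usage sketch, not code from this change: the cache keeps its own reference to an entry, a pipeline that adopts the entry takes an extra one, and drops it again when the pipeline is destroyed. The surrounding pipeline creation and destruction code is assumed, not quoted.

/* Hedged sketch of the expected ref-counting pattern. */
static void
example_adopt_cache_entry(struct v3dv_pipeline *pipeline,
                          struct v3dv_pipeline_shared_data *cache_entry)
{
   /* The cache keeps its reference; the pipeline takes an additional one. */
   v3dv_pipeline_shared_data_ref(cache_entry);
   pipeline->shared_data = cache_entry;
}

static void
example_release_shared_data(struct v3dv_device *device,
                            struct v3dv_pipeline *pipeline)
{
   /* Dropping the last reference destroys the shared data. */
   v3dv_pipeline_shared_data_unref(device, pipeline->shared_data);
   pipeline->shared_data = NULL;
}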
struct v3dv_descriptor *
@ -1953,15 +1989,13 @@ nir_shader* v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
const nir_shader_compiler_options *nir_options,
unsigned char sha1_key[20]);
struct v3dv_shader_variant*
v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
unsigned char sha1_key[20]);
struct v3dv_pipeline_shared_data *
v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache,
unsigned char sha1_key[20]);
void
v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache,
struct v3dv_shader_variant *variant);
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
void v3dv_shader_module_internal_init(struct v3dv_device *device,
struct vk_shader_module *module,

View file

@ -97,14 +97,15 @@ write_tmu_p0(struct v3dv_cmd_buffer *cmd_buffer,
/* We need to ensure that the texture bo is added to the job */
struct v3dv_bo *texture_bo =
v3dv_descriptor_map_get_texture_bo(descriptor_state, &pipeline->texture_map,
v3dv_descriptor_map_get_texture_bo(descriptor_state,
&pipeline->shared_data->texture_map,
pipeline->layout, texture_idx);
assert(texture_bo);
v3dv_job_add_bo(job, texture_bo);
struct v3dv_cl_reloc state_reloc =
v3dv_descriptor_map_get_texture_shader_state(descriptor_state,
&pipeline->texture_map,
&pipeline->shared_data->texture_map,
pipeline->layout,
texture_idx);
@ -130,12 +131,14 @@ write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer,
sampler_idx != V3DV_NO_SAMPLER_32BIT_IDX);
struct v3dv_cl_reloc sampler_state_reloc =
v3dv_descriptor_map_get_sampler_state(descriptor_state, &pipeline->sampler_map,
v3dv_descriptor_map_get_sampler_state(descriptor_state,
&pipeline->shared_data->sampler_map,
pipeline->layout, sampler_idx);
const struct v3dv_sampler *sampler =
v3dv_descriptor_map_get_sampler(descriptor_state, &pipeline->sampler_map,
pipeline->layout, sampler_idx);
v3dv_descriptor_map_get_sampler(descriptor_state,
&pipeline->shared_data->sampler_map,
pipeline->layout, sampler_idx);
assert(sampler);
/* Set unnormalized coordinates flag from sampler object */
@ -167,7 +170,7 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_descriptor_map *map =
content == QUNIFORM_UBO_ADDR || content == QUNIFORM_GET_UBO_SIZE ?
&pipeline->ubo_map : &pipeline->ssbo_map;
&pipeline->shared_data->ubo_map : &pipeline->shared_data->ssbo_map;
uint32_t offset =
content == QUNIFORM_UBO_ADDR ?
@ -285,7 +288,7 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_descriptor *descriptor =
v3dv_descriptor_map_get_descriptor(descriptor_state,
&pipeline->texture_map,
&pipeline->shared_data->texture_map,
pipeline->layout,
texture_idx, NULL);
@ -309,13 +312,13 @@ get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc
v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage,
struct v3dv_pipeline *pipeline,
struct v3dv_shader_variant *variant,
uint32_t **wg_count_offsets)
{
struct v3d_uniform_list *uinfo =
&p_stage->current_variant->prog_data.base->uniforms;
&variant->prog_data.base->uniforms;
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
struct v3dv_pipeline *pipeline = p_stage->pipeline;
struct v3dv_job *job = cmd_buffer->state.job;
assert(job);
@ -432,7 +435,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline_stage *p_stage)
struct v3dv_pipeline *pipeline,
struct v3dv_shader_variant *variant)
{
return v3dv_write_uniforms_wg_offsets(cmd_buffer, p_stage, NULL);
return v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, variant, NULL);
}