From 91f1bd8f81dff866c769ea3bcc58450b9293b1a4 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Wed, 22 Jun 2022 10:43:04 +0200
Subject: [PATCH] dzn: Cache pipeline info

We are already caching DXIL shaders individually, but that still forces
us to retrieve the NIR shaders and run the linking and binding
translation steps before we can finally query the cache for each DXIL
shader. Pipeline-level caching is about skipping those steps when we
can.

Note that a graphics/compute pipeline cache entry doesn't contain the
DXIL shaders themselves, only the hashes needed to retrieve those
shaders from the same cache.
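For illustration, a graphics pipeline entry is serialized as a header,
the vertex-input elements, then one SHA1 per active stage. The snippet
below is a minimal standalone sketch of that layout and of the size
computation done by dzn_pipeline_cache_add_gfx_pipeline(); the macros
and the input-element struct are local stand-ins for the Mesa/d3d12.h
definitions so it compiles on its own:

#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SHA1_DIGEST_LENGTH 20
/* Stand-in for Mesa's ALIGN_POT(): round x up to a power of two. */
#define ALIGN_POT(x, pot) (((x) + (pot) - 1) & ~((size_t)(pot) - 1))

/* Stand-in for the real D3D12 input-element desc (same alignment). */
typedef struct {
   const char *SemanticName;
   uint32_t SemanticIndex, Format, InputSlot, AlignedByteOffset;
   uint32_t InputSlotClass, InstanceDataStepRate;
} D3D12_INPUT_ELEMENT_DESC;

struct dzn_cached_gfx_pipeline_header {
   uint32_t stages;      /* bitmask of shader stages stored in the entry */
   uint32_t input_count; /* number of input-element descs that follow */
};

/* Blob = header, aligned input-element array, then one 20-byte DXIL
 * hash per active stage, each pointing back into the same cache. */
static size_t blob_size(uint32_t stage_count, uint32_t input_count)
{
   return ALIGN_POT(sizeof(struct dzn_cached_gfx_pipeline_header),
                    alignof(D3D12_INPUT_ELEMENT_DESC)) +
          sizeof(D3D12_INPUT_ELEMENT_DESC) * input_count +
          (size_t)stage_count * SHA1_DIGEST_LENGTH;
}

int main(void)
{
   /* e.g. a VS+FS pipeline with three vertex attributes */
   printf("%zu bytes\n", blob_size(2, 3));
   return 0;
}

The lookup side walks the same layout in reverse: read the header, copy
the input elements, then resolve each per-stage hash through
dzn_pipeline_cache_lookup_dxil_shader().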
Reviewed-by: Jesse Natalie
Part-of:
---
 src/microsoft/vulkan/dzn_pipeline.c | 274 ++++++++++++++++++++++++++--
 1 file changed, 257 insertions(+), 17 deletions(-)

diff --git a/src/microsoft/vulkan/dzn_pipeline.c b/src/microsoft/vulkan/dzn_pipeline.c
index f998c420f45..f2df809e343 100644
--- a/src/microsoft/vulkan/dzn_pipeline.c
+++ b/src/microsoft/vulkan/dzn_pipeline.c
@@ -515,6 +515,134 @@ dzn_pipeline_cache_add_dxil_shader(struct vk_pipeline_cache *cache,
    vk_pipeline_cache_object_unref(cache_obj);
 }
 
+struct dzn_cached_gfx_pipeline_header {
+   uint32_t stages;
+   uint32_t input_count;
+};
+
+static VkResult
+dzn_pipeline_cache_lookup_gfx_pipeline(struct dzn_graphics_pipeline *pipeline,
+                                       struct vk_pipeline_cache *cache,
+                                       const uint8_t *pipeline_hash,
+                                       bool *cache_hit)
+{
+   *cache_hit = false;
+
+   if (!cache)
+      return VK_SUCCESS;
+
+   struct vk_pipeline_cache_object *cache_obj = NULL;
+
+   cache_obj =
+      vk_pipeline_cache_lookup_object(cache, pipeline_hash, SHA1_DIGEST_LENGTH,
+                                      &dzn_cached_blob_ops,
+                                      NULL);
+   if (!cache_obj)
+      return VK_SUCCESS;
+
+   struct dzn_cached_blob *cached_blob =
+      container_of(cache_obj, struct dzn_cached_blob, base);
+   D3D12_PIPELINE_STATE_STREAM_DESC *stream_desc =
+      &pipeline->templates.stream_desc;
+
+   const struct dzn_cached_gfx_pipeline_header *info =
+      (const struct dzn_cached_gfx_pipeline_header *)(cached_blob->data);
+   size_t offset = sizeof(*info);
+
+   assert(cached_blob->size >= sizeof(*info));
+
+   if (info->input_count > 0) {
+      offset = ALIGN_POT(offset, alignof(D3D12_INPUT_LAYOUT_DESC));
+      const D3D12_INPUT_ELEMENT_DESC *inputs =
+         (const D3D12_INPUT_ELEMENT_DESC *)((uint8_t *)cached_blob->data + offset);
+
+      assert(cached_blob->size >= offset + sizeof(*inputs) * info->input_count);
+
+      memcpy(pipeline->templates.inputs, inputs,
+             sizeof(*inputs) * info->input_count);
+      d3d12_gfx_pipeline_state_stream_new_desc(stream_desc, INPUT_LAYOUT, D3D12_INPUT_LAYOUT_DESC, desc);
+      desc->pInputElementDescs = pipeline->templates.inputs;
+      desc->NumElements = info->input_count;
+      offset += sizeof(*inputs) * info->input_count;
+   }
+
+   assert(cached_blob->size == offset + util_bitcount(info->stages) * SHA1_DIGEST_LENGTH);
+
+   u_foreach_bit(s, info->stages) {
+      uint8_t *dxil_hash = (uint8_t *)cached_blob->data + offset;
+      gl_shader_stage stage;
+
+      D3D12_SHADER_BYTECODE *slot =
+         dzn_pipeline_get_gfx_shader_slot(stream_desc, s);
+
+      VkResult ret =
+         dzn_pipeline_cache_lookup_dxil_shader(cache, dxil_hash, &stage, slot);
+      if (ret != VK_SUCCESS)
+         return ret;
+
+      assert(stage == s);
+      offset += SHA1_DIGEST_LENGTH;
+   }
+
+   *cache_hit = true;
+
+   vk_pipeline_cache_object_unref(cache_obj);
+   return VK_SUCCESS;
+}
+
+static void
+dzn_pipeline_cache_add_gfx_pipeline(struct dzn_graphics_pipeline *pipeline,
+                                    struct vk_pipeline_cache *cache,
+                                    uint32_t vertex_input_count,
+                                    const uint8_t *pipeline_hash,
+                                    const uint8_t *const *dxil_hashes)
+{
+   size_t offset =
+      ALIGN_POT(sizeof(struct dzn_cached_gfx_pipeline_header), alignof(D3D12_INPUT_ELEMENT_DESC)) +
+      (sizeof(D3D12_INPUT_ELEMENT_DESC) * vertex_input_count);
+   uint32_t stages = 0;
+
+   for (uint32_t i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
+      if (pipeline->templates.shaders[i].bc) {
+         stages |= BITFIELD_BIT(i);
+         offset += SHA1_DIGEST_LENGTH;
+      }
+   }
+
+   struct vk_pipeline_cache_object *cache_obj =
+      dzn_cached_blob_create(cache->base.device, pipeline_hash, NULL, offset);
+   if (!cache_obj)
+      return;
+
+   struct dzn_cached_blob *cached_blob =
+      container_of(cache_obj, struct dzn_cached_blob, base);
+
+   offset = 0;
+   struct dzn_cached_gfx_pipeline_header *info =
+      (struct dzn_cached_gfx_pipeline_header *)(cached_blob->data);
+
+   info->input_count = vertex_input_count;
+   info->stages = stages;
+
+   offset = ALIGN_POT(offset + sizeof(*info), alignof(D3D12_INPUT_ELEMENT_DESC));
+
+   D3D12_INPUT_ELEMENT_DESC *inputs =
+      (D3D12_INPUT_ELEMENT_DESC *)((uint8_t *)cached_blob->data + offset);
+   memcpy(inputs, pipeline->templates.inputs,
+          sizeof(*inputs) * vertex_input_count);
+   offset += sizeof(*inputs) * vertex_input_count;
+
+   u_foreach_bit(s, stages) {
+      uint8_t *dxil_hash = (uint8_t *)cached_blob->data + offset;
+
+      memcpy(dxil_hash, dxil_hashes[s], SHA1_DIGEST_LENGTH);
+      offset += SHA1_DIGEST_LENGTH;
+   }
+
+   cache_obj = vk_pipeline_cache_add_object(cache, cache_obj);
+   vk_pipeline_cache_object_unref(cache_obj);
+}
+
 static void
 dzn_graphics_pipeline_hash_attribs(D3D12_INPUT_ELEMENT_DESC *attribs,
                                    enum pipe_format *vi_conversions,
@@ -546,7 +674,9 @@ dzn_graphics_pipeline_compile_shaders(struct dzn_device *device,
       uint8_t spirv_hash[SHA1_DIGEST_LENGTH];
       uint8_t dxil_hash[SHA1_DIGEST_LENGTH];
    } stages[MESA_VULKAN_SHADER_STAGES] = { 0 };
+   const uint8_t *dxil_hashes[MESA_VULKAN_SHADER_STAGES] = { 0 };
    uint8_t attribs_hash[SHA1_DIGEST_LENGTH];
+   uint8_t pipeline_hash[SHA1_DIGEST_LENGTH];
    gl_shader_stage yz_flip_stage = MESA_SHADER_NONE;
    uint32_t active_stage_mask = 0;
    VkResult ret;
@@ -609,9 +739,31 @@ dzn_graphics_pipeline_compile_shaders(struct dzn_device *device,
    if (cache) {
       dzn_graphics_pipeline_hash_attribs(attribs, vi_conversions,
                                          attribs_hash);
+
+      struct mesa_sha1 pipeline_hash_ctx;
+
+      _mesa_sha1_init(&pipeline_hash_ctx);
+      _mesa_sha1_update(&pipeline_hash_ctx, attribs_hash, sizeof(attribs_hash));
+      _mesa_sha1_update(&pipeline_hash_ctx, &yz_flip_mode, sizeof(yz_flip_mode));
+      _mesa_sha1_update(&pipeline_hash_ctx, &y_flip_mask, sizeof(y_flip_mask));
+      _mesa_sha1_update(&pipeline_hash_ctx, &z_flip_mask, sizeof(z_flip_mask));
+      _mesa_sha1_update(&pipeline_hash_ctx, &force_sample_rate_shading, sizeof(force_sample_rate_shading));
+
       u_foreach_bit(stage, active_stage_mask) {
          vk_pipeline_hash_shader_stage(stages[stage].info, stages[stage].spirv_hash);
+         _mesa_sha1_update(&pipeline_hash_ctx, stages[stage].spirv_hash, sizeof(stages[stage].spirv_hash));
+         _mesa_sha1_update(&pipeline_hash_ctx, layout->stages[stage].hash, sizeof(layout->stages[stage].hash));
       }
+      _mesa_sha1_final(&pipeline_hash_ctx, pipeline_hash);
+
+      bool cache_hit;
+      ret = dzn_pipeline_cache_lookup_gfx_pipeline(pipeline, cache, pipeline_hash,
+                                                   &cache_hit);
+      if (ret != VK_SUCCESS)
+         return ret;
+
+      if (cache_hit)
+         return VK_SUCCESS;
    }
 
    /* Second step: get NIR shaders for all stages. */
@@ -687,6 +839,7 @@ dzn_graphics_pipeline_compile_shaders(struct dzn_device *device,
       _mesa_sha1_update(&dxil_hash_ctx, stages[stage].spirv_hash, sizeof(stages[stage].spirv_hash));
       _mesa_sha1_update(&dxil_hash_ctx, bindings_hash, sizeof(bindings_hash));
       _mesa_sha1_final(&dxil_hash_ctx, stages[stage].dxil_hash);
+      dxil_hashes[stage] = stages[stage].dxil_hash;
 
       gl_shader_stage cached_stage;
       D3D12_SHADER_BYTECODE bc;
@@ -704,26 +857,26 @@ dzn_graphics_pipeline_compile_shaders(struct dzn_device *device,
       }
    }
 
+   uint32_t vert_input_count = 0;
    if (pipeline->templates.shaders[MESA_SHADER_VERTEX].nir) {
       /* Now, declare one D3D12_INPUT_ELEMENT_DESC per VS input variable, so
        * we can handle location overlaps properly.
        */
-      unsigned drv_loc = 0;
       nir_foreach_shader_in_variable(var, pipeline->templates.shaders[MESA_SHADER_VERTEX].nir) {
          assert(var->data.location >= VERT_ATTRIB_GENERIC0);
          unsigned loc = var->data.location - VERT_ATTRIB_GENERIC0;
-         assert(drv_loc < D3D12_VS_INPUT_REGISTER_COUNT);
+         assert(vert_input_count < D3D12_VS_INPUT_REGISTER_COUNT);
          assert(loc < MAX_VERTEX_GENERIC_ATTRIBS);
-         pipeline->templates.inputs[drv_loc] = attribs[loc];
-         pipeline->templates.inputs[drv_loc].SemanticIndex = drv_loc;
-         var->data.driver_location = drv_loc++;
+         pipeline->templates.inputs[vert_input_count] = attribs[loc];
+         pipeline->templates.inputs[vert_input_count].SemanticIndex = vert_input_count;
+         var->data.driver_location = vert_input_count++;
       }
 
-      if (drv_loc > 0) {
+      if (vert_input_count > 0) {
          d3d12_gfx_pipeline_state_stream_new_desc(out, INPUT_LAYOUT, D3D12_INPUT_LAYOUT_DESC, desc);
          desc->pInputElementDescs = pipeline->templates.inputs;
-         desc->NumElements = drv_loc;
+         desc->NumElements = vert_input_count;
       }
    }
@@ -757,6 +910,10 @@ dzn_graphics_pipeline_compile_shaders(struct dzn_device *device,
       dzn_pipeline_cache_add_dxil_shader(cache, stages[stage].dxil_hash, stage, slot);
    }
 
+   if (cache)
+      dzn_pipeline_cache_add_gfx_pipeline(pipeline, cache, vert_input_count, pipeline_hash,
+                                          dxil_hashes);
+
    return VK_SUCCESS;
 }
 
@@ -1852,6 +2009,71 @@ dzn_compute_pipeline_destroy(struct dzn_compute_pipeline *pipeline,
    vk_free2(&pipeline->base.base.device->alloc, alloc, pipeline);
 }
 
+static VkResult
+dzn_pipeline_cache_lookup_compute_pipeline(struct vk_pipeline_cache *cache,
+                                           uint8_t *pipeline_hash,
+                                           D3D12_PIPELINE_STATE_STREAM_DESC *stream_desc,
+                                           D3D12_SHADER_BYTECODE *dxil,
+                                           bool *cache_hit)
+{
+   *cache_hit = false;
+
+   if (!cache)
+      return VK_SUCCESS;
+
+   struct vk_pipeline_cache_object *cache_obj = NULL;
+
+   cache_obj =
+      vk_pipeline_cache_lookup_object(cache, pipeline_hash, SHA1_DIGEST_LENGTH,
+                                      &dzn_cached_blob_ops,
+                                      NULL);
+   if (!cache_obj)
+      return VK_SUCCESS;
+
+   struct dzn_cached_blob *cached_blob =
+      container_of(cache_obj, struct dzn_cached_blob, base);
+
+   assert(cached_blob->size == SHA1_DIGEST_LENGTH);
+
+   const uint8_t *dxil_hash = cached_blob->data;
+   gl_shader_stage stage;
+
+   VkResult ret =
+      dzn_pipeline_cache_lookup_dxil_shader(cache, dxil_hash, &stage, dxil);
+
+   if (ret != VK_SUCCESS || stage == MESA_SHADER_NONE)
+      goto out;
+
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   d3d12_compute_pipeline_state_stream_new_desc(stream_desc, CS, D3D12_SHADER_BYTECODE, slot);
+   *slot = *dxil;
+   *cache_hit = true;
+
+out:
+   vk_pipeline_cache_object_unref(cache_obj);
+   return ret;
+}
+
+static void
+dzn_pipeline_cache_add_compute_pipeline(struct vk_pipeline_cache *cache,
+                                        uint8_t *pipeline_hash,
+                                        uint8_t *dxil_hash)
+{
+   struct vk_pipeline_cache_object *cache_obj =
+      dzn_cached_blob_create(cache->base.device, pipeline_hash, NULL, SHA1_DIGEST_LENGTH);
+   if (!cache_obj)
+      return;
+
+   struct dzn_cached_blob *cached_blob =
+      container_of(cache_obj, struct dzn_cached_blob, base);
+
+   memcpy((void *)cached_blob->data, dxil_hash, SHA1_DIGEST_LENGTH);
+
+   cache_obj = vk_pipeline_cache_add_object(cache, cache_obj);
+   vk_pipeline_cache_object_unref(cache_obj);
+}
+
 static VkResult
 dzn_compute_pipeline_compile_shader(struct dzn_device *device,
                                     struct dzn_compute_pipeline *pipeline,
@@ -1861,18 +2083,33 @@ dzn_compute_pipeline_compile_shader(struct dzn_device *device,
                                     D3D12_SHADER_BYTECODE *shader,
                                     const VkComputePipelineCreateInfo *info)
 {
-   uint8_t spirv_hash[SHA1_DIGEST_LENGTH];
+   uint8_t spirv_hash[SHA1_DIGEST_LENGTH], pipeline_hash[SHA1_DIGEST_LENGTH];
+   VkResult ret = VK_SUCCESS;
    nir_shader *nir = NULL;
 
-   if (cache)
-      vk_pipeline_hash_shader_stage(&info->stage, spirv_hash);
+   if (cache) {
+      struct mesa_sha1 pipeline_hash_ctx;
 
-   VkResult ret =
-      dzn_pipeline_get_nir_shader(device, layout, cache, spirv_hash,
-                                  &info->stage, MESA_SHADER_COMPUTE,
-                                  DXIL_SPIRV_YZ_FLIP_NONE, 0, 0,
-                                  false, NULL,
-                                  dxil_get_nir_compiler_options(), &nir);
+      _mesa_sha1_init(&pipeline_hash_ctx);
+      vk_pipeline_hash_shader_stage(&info->stage, spirv_hash);
+      _mesa_sha1_update(&pipeline_hash_ctx, spirv_hash, sizeof(spirv_hash));
+      _mesa_sha1_update(&pipeline_hash_ctx, layout->stages[MESA_SHADER_COMPUTE].hash,
+                        sizeof(layout->stages[MESA_SHADER_COMPUTE].hash));
+      _mesa_sha1_final(&pipeline_hash_ctx, pipeline_hash);
+
+      bool cache_hit = false;
+      ret = dzn_pipeline_cache_lookup_compute_pipeline(cache, pipeline_hash,
+                                                       stream_desc, shader,
+                                                       &cache_hit);
+      if (ret != VK_SUCCESS || cache_hit)
+         goto out;
+   }
+
+   ret = dzn_pipeline_get_nir_shader(device, layout, cache, spirv_hash,
+                                     &info->stage, MESA_SHADER_COMPUTE,
+                                     DXIL_SPIRV_YZ_FLIP_NONE, 0, 0,
+                                     false, NULL,
+                                     dxil_get_nir_compiler_options(), &nir);
    if (ret != VK_SUCCESS)
       return ret;
 
@@ -1898,6 +2135,7 @@ dzn_compute_pipeline_compile_shader(struct dzn_device *device,
          assert(stage == MESA_SHADER_COMPUTE);
          d3d12_compute_pipeline_state_stream_new_desc(stream_desc, CS, D3D12_SHADER_BYTECODE, cs);
          *cs = *shader;
+         dzn_pipeline_cache_add_compute_pipeline(cache, pipeline_hash, dxil_hash);
          goto out;
       }
    }
@@ -1909,8 +2147,10 @@ dzn_compute_pipeline_compile_shader(struct dzn_device *device,
    d3d12_compute_pipeline_state_stream_new_desc(stream_desc, CS, D3D12_SHADER_BYTECODE, cs);
    *cs = *shader;
 
-   if (cache)
+   if (cache) {
       dzn_pipeline_cache_add_dxil_shader(cache, dxil_hash, MESA_SHADER_COMPUTE, shader);
+      dzn_pipeline_cache_add_compute_pipeline(cache, pipeline_hash, dxil_hash);
+   }
 
 out:
    ralloc_free(nir);
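Note on the compute path: the cached pipeline entry stores nothing but
the 20-byte DXIL hash, so a hit costs two cache lookups (pipeline hash
-> DXIL hash -> DXIL shader). Below is a minimal standalone sketch of
that two-level chaining; the fixed-size table and the cache_add() and
cache_lookup() helpers are hypothetical stand-ins for the
vk_pipeline_cache API, only the chaining scheme mirrors the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HASH_LEN 20 /* SHA1_DIGEST_LENGTH */

struct entry { uint8_t key[HASH_LEN]; const void *data; size_t size; };
static struct entry table[64];
static size_t entry_count;

static void cache_add(const uint8_t key[HASH_LEN], const void *data, size_t size)
{
   table[entry_count] = (struct entry){ .data = data, .size = size };
   memcpy(table[entry_count++].key, key, HASH_LEN);
}

static const struct entry *cache_lookup(const uint8_t key[HASH_LEN])
{
   for (size_t i = 0; i < entry_count; i++)
      if (!memcmp(table[i].key, key, HASH_LEN))
         return &table[i];
   return NULL;
}

int main(void)
{
   static const char dxil[] = "...compiled DXIL bytecode...";
   uint8_t dxil_hash[HASH_LEN] = { 0xaa };     /* placeholder digests */
   uint8_t pipeline_hash[HASH_LEN] = { 0xbb };

   /* add_dxil_shader + add_compute_pipeline: the pipeline entry holds
    * only the DXIL hash, never the bytecode itself. */
   cache_add(dxil_hash, dxil, sizeof(dxil));
   cache_add(pipeline_hash, dxil_hash, HASH_LEN);

   /* lookup_compute_pipeline: the first hop resolves the DXIL hash,
    * the second resolves the shader; either miss is a pipeline miss. */
   const struct entry *pipe = cache_lookup(pipeline_hash);
   const struct entry *shader = pipe ? cache_lookup(pipe->data) : NULL;

   printf("%s\n", shader ? (const char *)shader->data : "cache miss");
   return 0;
}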