mesa/src/intel/vulkan/anv_shader_compile.c

/* Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_private.h"
#include "anv_nir.h"
#include "anv_shader.h"

#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"

#include "common/intel_compute_slm.h"
#include "common/intel_l3_config.h"

#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_nir_rt.h"
#include "compiler/intel_nir.h"
#include "compiler/jay/jay.h"

#include "git_sha1.h"

typedef void (*game_wa_callback)(nir_shader *nir);

/* Structure to hold a game-specific workaround entry */
struct game_wa_entry {
   game_wa_callback cb;
   uint32_t shader_blake3s[16][BLAKE3_OUT_LEN32];
};

/* Workaround for a shader in Horizon Forbidden West that causes
 * visual corruption.  The shader writes the result of fsqrt to
 * storage images with a 16-bit image format and misrendering
 * occurs when those values are denormal for an unknown reason.
 *
 * This clamps the image writes to the smallest fp16 normalized
 * value.  (Pattern matching against fsqrt is easy to do in a one
 * line algebraic pass, while matching image stores is harder.)
 *
 * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/12555
 */
static void
wa_forbidden_west(nir_shader *nir)
{
   NIR_PASS(_, nir, brw_nir_apply_sqrt_workarounds);
}

/* Try to detect shaders relying on 32-wide subgroups. Usually they have a
 * pattern like this:
 *
 * div 32    %1096 = @load_subgroup_invocation
 * div 32    %1245 = iand %1096, %1228 (0x1f)
 * div 32    %1246 = ixor %1245, %13 (0x1)
 * div 32    %1247 = @shuffle (%1244, %1246)
 */
static bool is_alu1_iand_0x1f(nir_alu_instr *alu)
{
   if (!alu || alu->op != nir_op_iand)
      return false;

   for (uint32_t i = 0; i < 2; i++) {
       if (nir_src_is_const(alu->src[i].src) &&
           nir_alu_src_as_uint(alu->src[i]) == 0x1f)
          return true;
   }

   return false;
}

static bool is_simd32_shuffle(nir_intrinsic_instr *intrin)
{
   nir_alu_instr *alu1 = nir_src_as_alu(intrin->src[1]);
   if (alu1 == NULL)
      return false;

   if (is_alu1_iand_0x1f(alu1))
      return true;

   for (uint32_t i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) {
      if (is_alu1_iand_0x1f(nir_src_as_alu(alu1->src[i].src)))
         return true;
   }

   return false;
}

/* Try to detect shaders testing with a sequence like this :
 *
 * 32x3    %49 = @load_local_invocation_id
 * 32    %1673 = load_const (0xffffffe0 = -32 = 4294967264)
 * 32    %1674 = iand %49.x, %1673 (0xffffffe0)
 * 32    %1675 = @load_subgroup_size
 * 32    %1676 = umod %1674, %1675
 *
 * This sequence appears to be targetted at subgroup sizes larger than 32. The
 * problem in this sequence is that subgroup size is expected to be >= 32 to
 * match the masking of local_invocation_id above. If inferior, the umod
 * operation returns the same value as if the subgroup was 32.
 */
static bool is_alu_used_for_umod_subgroup_size(nir_alu_instr *in_alu)
{
   nir_foreach_use(src, &in_alu->def) {
      nir_instr *instr = nir_src_use_instr(src);
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (alu->op != nir_op_umod &&
          alu->op != nir_op_imod)
         continue;

      for (uint32_t i = 0; i < 2; i++) {
         if (&alu->src[i].src == src)
            continue;

         if (!nir_src_is_intrinsic(alu->src[i].src) ||
             nir_src_as_intrinsic(alu->src[i].src)->intrinsic != nir_intrinsic_load_subgroup_size)
            continue;

         return true;
      }
   }

   return false;
}

static bool
is_local_invoc_id_used_with_simd32_assumption(nir_intrinsic_instr *subgroup_inv)
{
   nir_foreach_use(src, &subgroup_inv->def) {
      nir_instr *instr = nir_src_use_instr(src);
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (alu->op != nir_op_iand)
         continue;

      /* nir_print_instr(&alu->instr, stderr); */
      /* fprintf(stderr, "\n"); */

      for (uint32_t i = 0; i < 2; i++) {
         if (&alu->src[i].src == src)
            continue;

         if (!nir_src_is_const(alu->src[i].src))
            continue;

         if (nir_src_as_uint(alu->src[i].src) != 0xffffffe0)
            continue;

         if (is_alu_used_for_umod_subgroup_size(alu))
            return true;
      }
   }

   return false;
}

static bool
detect_simd32_requirement(nir_builder *b,
                          nir_intrinsic_instr *intrin,
                          void *data)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle:
      return is_simd32_shuffle(intrin);

   case nir_intrinsic_load_local_invocation_id:
      return is_local_invoc_id_used_with_simd32_assumption(intrin);

   default:
      return false;
   }
}

/* List of game-specific workarounds identified by BLAKE3 hash of the shader.
 * Add new workarounds here as needed.
 */
static const struct game_wa_entry game_was[] = {
   {
      .cb = wa_forbidden_west,
      .shader_blake3s = {
         {0x51683151, 0xe044f0ce, 0xc210a762, 0xb12b2da4, 0x4e69ddc0, 0x237b1cc1, 0xc84bcf09, 0x31cfe883},
      },
   },
};

/* Apply game-specific workarounds based on the shader's BLAKE3 hash */
static void
anv_nir_apply_shader_workarounds(nir_shader *nir)
{
   for (unsigned i = 0; i < ARRAY_SIZE(game_was); i++) {
      const struct game_wa_entry *wa = &game_was[i];
      for (unsigned j = 0; j < ARRAY_SIZE(wa->shader_blake3s); j++) {
         if (_mesa_printed_blake3_equal(nir->info.source_blake3, wa->shader_blake3s[j])) {
            wa->cb(nir);
            return;
         }
      }
   }
}

static enum brw_robustness_flags
anv_get_robust_flags(const struct vk_pipeline_robustness_state *rstate)
{
   return
      ((rstate->storage_buffers !=
        VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
       BRW_ROBUSTNESS_SSBO : 0) |
      ((rstate->uniform_buffers !=
        VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
       BRW_ROBUSTNESS_UBO : 0);
}

static enum anv_descriptor_set_layout_type
set_layouts_get_layout_type(struct anv_descriptor_set_layout * const *set_layouts,
                            uint32_t set_layout_count)
{
   for (uint32_t s = 0; s < set_layout_count; s++) {
      if (set_layouts[s]) {
         return set_layouts[s]->type;
      }
   }

   return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN;
}

void
anv_shader_init_uuid(struct anv_physical_device *device)
{
   /* We should include any parameter here that will change the compiler's
    * output. Mostly it's workarounds, but there is also settings for using
    * indirect descriptors (a different binding model).
    *
    * The fp64 workaround is skipped because although it changes the
    * compiler's output, not having that workaroung enabled with an app
    * expecting fp64 support will just crash in the backend.
    */
   blake3_hasher ctx;
   _mesa_blake3_init(&ctx);

   const bool indirect_descriptors = device->indirect_descriptors;
   _mesa_blake3_update(&ctx, &indirect_descriptors, sizeof(indirect_descriptors));

   const int spilling_rate = device->compiler->spilling_rate;
   _mesa_blake3_update(&ctx, &spilling_rate, sizeof(spilling_rate));

   const uint8_t afs = device->instance->assume_full_subgroups;
   _mesa_blake3_update(&ctx, &afs, sizeof(afs));

   const bool afswb = device->instance->assume_full_subgroups_with_barrier;
   _mesa_blake3_update(&ctx, &afswb, sizeof(afswb));

   const bool afs_shm = device->instance->assume_full_subgroups_with_shared_memory;
   _mesa_blake3_update(&ctx, &afs_shm, sizeof(afs_shm));

   const bool erwf = device->instance->emulate_read_without_format;
   _mesa_blake3_update(&ctx, &erwf, sizeof(erwf));

   const bool lttd = device->instance->lower_terminate_to_discard;
   _mesa_blake3_update(&ctx, &lttd, sizeof(lttd));

   const bool large_wg_wa =
      device->instance->large_workgroup_non_coherent_image_workaround;
   _mesa_blake3_update(&ctx, &large_wg_wa, sizeof(large_wg_wa));

   const bool lto_disable = device->instance->disable_lto;
   _mesa_blake3_update(&ctx, &lto_disable, sizeof(lto_disable));

   const bool btp_bti_rcc = device->rt_change_needs_flush;
   _mesa_blake3_update(&ctx, &btp_bti_rcc, sizeof(btp_bti_rcc));

   const bool cbv_push_buffer = device->instance->promote_cbv_to_push_buffers;
   _mesa_blake3_update(&ctx, &cbv_push_buffer, sizeof(cbv_push_buffer));

   uint8_t blake3[BLAKE3_KEY_LEN];
   _mesa_blake3_final(&ctx, blake3);
   memcpy(device->shader_binary_uuid, blake3, sizeof(device->shader_binary_uuid));
}

static const struct nir_shader_compiler_options *
anv_shader_get_nir_options(struct vk_physical_device *device,
                           mesa_shader_stage stage,
                           const struct vk_pipeline_robustness_state *rs)

{
   struct anv_physical_device *pdevice =
      container_of(device, struct anv_physical_device, vk);
   const struct brw_compiler *compiler = pdevice->compiler;

   return &compiler->nir_options[stage];
}

static struct spirv_to_nir_options
anv_shader_get_spirv_options(struct vk_physical_device *device,
                             mesa_shader_stage stage,
                             const struct vk_pipeline_robustness_state *rs)
{
   struct anv_physical_device *pdevice =
      container_of(device, struct anv_physical_device, vk);
   enum brw_robustness_flags robust_flags = anv_get_robust_flags(rs);

   return (struct spirv_to_nir_options) {
      .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
      .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
      .phys_ssbo_addr_format = nir_address_format_64bit_global,
      .push_const_addr_format = nir_address_format_logical,

      .printf = ANV_DEBUG(SHADER_PRINT),

      /* TODO: Consider changing this to an address format that has the NULL
       * pointer equals to 0.  That might be a better format to play nice
       * with certain code / code generators.
       */
      .shared_addr_format = nir_address_format_32bit_offset,

      .min_ubo_alignment = ANV_UBO_ALIGNMENT,
      .min_ssbo_alignment = ANV_SSBO_ALIGNMENT,

      .workarounds = {
         .lower_terminate_to_discard = pdevice->instance->lower_terminate_to_discard,
      },
   };
}

static void
anv_shader_preprocess_nir(struct vk_physical_device *device,
                          nir_shader *nir,
                          const struct vk_pipeline_robustness_state *rs)
{
   struct anv_physical_device *pdevice =
      container_of(device, struct anv_physical_device, vk);
   const struct brw_compiler *compiler = pdevice->compiler;

   NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries,
            nir_shader_get_entrypoint(nir), nir_var_shader_out);

   const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
      .point_coord = true,
      .primitive_id = nir->info.stage == MESA_SHADER_FRAGMENT,
   };
   NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);

   NIR_PASS(_, nir, nir_update_image_intrinsic_from_var);

   const nir_opt_access_options opt_access_options = {
      .is_vulkan = true,
   };
   NIR_PASS(_, nir, nir_opt_access, &opt_access_options);

   struct brw_nir_compiler_opts opts = {
      .robust_image_access = rs->images == VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS ||
                             rs->images == VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT,
   };
   brw_preprocess_nir(compiler, nir, &opts);

   NIR_PASS(_, nir, nir_opt_barrier_modes);
   NIR_PASS(_, nir, nir_opt_acquire_release_barriers, SCOPE_QUEUE_FAMILY);

   if (ANV_DEBUG(SHADER_PRINT)) {
      const nir_lower_printf_options printf_opts = {
         .ptr_bit_size = 64,
         .hash_format_strings = true,
      };
      NIR_PASS(_, nir, nir_lower_printf, &printf_opts);
   }

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
}

static void
populate_base_prog_key(struct brw_base_prog_key *key,
                       const struct vk_physical_device *device,
                       const struct vk_pipeline_robustness_state *rs)
{
   const struct anv_physical_device *pdevice =
      container_of(device, const struct anv_physical_device, vk);

   /* We can avoid including this for hashing because the runtime already
    * hashes that information. We just put it here for at compile time.
    */
   if (rs != NULL)
      key->robust_flags = anv_get_robust_flags(rs);
   key->divergent_atomics_flags = pdevice->instance->enable_opt_divergent_atomics;
   key->limit_trig_input_range = pdevice->instance->limit_trig_input_range;
}

static void
populate_base_gfx_prog_key(struct brw_base_prog_key *key,
                           const struct vk_physical_device *device,
                           const struct vk_pipeline_robustness_state *rs,
                           const struct vk_graphics_pipeline_state *gfx_state,
                           VkShaderStageFlags link_stages)
{
   const struct anv_physical_device *pdevice =
      container_of(device, const struct anv_physical_device, vk);

   populate_base_prog_key(key, device, rs);

   key->view_mask = (gfx_state && gfx_state->mv) ? gfx_state->mv->view_mask : 0;

   key->vue_layout =
      (util_bitcount(link_stages) > 1 && (link_stages & VK_SHADER_STAGE_FRAGMENT_BIT)) ?
      INTEL_VUE_LAYOUT_FIXED :
      pdevice->info.verx10 >= 125 ? INTEL_VUE_LAYOUT_SEPARATE_MESH :
      INTEL_VUE_LAYOUT_SEPARATE;
}

static void
populate_vs_prog_key(struct brw_vs_prog_key *key,
                     const struct vk_physical_device *device,
                     const struct vk_pipeline_robustness_state *rs,
                     const struct vk_graphics_pipeline_state *state,
                     VkShaderStageFlags link_stages)
{
   const struct anv_physical_device *pdevice =
      container_of(device, const struct anv_physical_device, vk);

   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);

   key->vf_component_packing = pdevice->instance->vf_component_packing;
}

static void
populate_tcs_prog_key(struct brw_tcs_prog_key *key,
                      const struct vk_physical_device *device,
                      const struct vk_pipeline_robustness_state *rs,
                      const struct vk_graphics_pipeline_state *state,
                      VkShaderStageFlags link_stages)
{
   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);

   if (state && state->ts &&
       !BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS))
      key->input_vertices = state->ts->patch_control_points;

   key->separate_tess_vue_layout =
      !(link_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
}

static void
populate_tes_prog_key(struct brw_tes_prog_key *key,
                      const struct vk_physical_device *device,
                      const struct vk_pipeline_robustness_state *rs,
                      const struct vk_graphics_pipeline_state *state,
                      VkShaderStageFlags link_stages)
{
   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);

   key->separate_tess_vue_layout =
      !(link_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
}

static void
populate_gs_prog_key(struct brw_gs_prog_key *key,
                     const struct vk_physical_device *device,
                     const struct vk_pipeline_robustness_state *rs,
                     const struct vk_graphics_pipeline_state *state,
                     VkShaderStageFlags link_stages)
{
   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);
}

static void
populate_task_prog_key(struct brw_task_prog_key *key,
                       const struct vk_physical_device *device,
                       const struct vk_pipeline_robustness_state *rs,
                       const struct vk_graphics_pipeline_state *state,
                       VkShaderStageFlags link_stages)
{
   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);
}

static void
populate_mesh_prog_key(struct brw_mesh_prog_key *key,
                       const struct vk_physical_device *device,
                       const struct vk_pipeline_robustness_state *rs,
                       const struct vk_graphics_pipeline_state *state,
                       VkShaderStageFlags link_stages)
{
   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);
}

static bool
pipeline_has_coarse_pixel(const struct vk_graphics_pipeline_state *state)
{
   if (state == NULL)
      return true;

   /* The Vulkan 1.2.199 spec says:
    *
    *    "If any of the following conditions are met, Cxy' must be set to
    *    {1,1}:
    *
    *     * If Sample Shading is enabled.
    *     * [...]"
    *
    * And "sample shading" is defined as follows:
    *
    *    "Sample shading is enabled for a graphics pipeline:
    *
    *     * If the interface of the fragment shader entry point of the
    *       graphics pipeline includes an input variable decorated with
    *       SampleId or SamplePosition. In this case minSampleShadingFactor
    *       takes the value 1.0.
    *
    *     * Else if the sampleShadingEnable member of the
    *       VkPipelineMultisampleStateCreateInfo structure specified when
    *       creating the graphics pipeline is set to VK_TRUE. In this case
    *       minSampleShadingFactor takes the value of
    *       VkPipelineMultisampleStateCreateInfo::minSampleShading.
    *
    *    Otherwise, sample shading is considered disabled."
    *
    * The first bullet above is handled by the back-end compiler because those
    * inputs both force per-sample dispatch.  The second bullet is handled
    * here.  Note that this sample shading being enabled has nothing to do
    * with minSampleShading.
    */
   if (state->ms != NULL && state->ms->sample_shading_enable)
      return false;

   /* Not dynamic & pipeline has a 1x1 fragment shading rate with no
    * possibility for element of the pipeline to change the value or fragment
    * shading rate not specified at all.
    */
   if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_FSR) &&
       (state->fsr == NULL ||
        (state->fsr->fragment_size.width <= 1 &&
         state->fsr->fragment_size.height <= 1 &&
         state->fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
         state->fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)))
      return false;

   return true;
}

static uint32_t
rp_color_mask(const struct vk_graphics_pipeline_state *state)
{
   if (state == NULL || state->rp == NULL ||
       !vk_render_pass_state_has_attachment_info(state->rp))
      return ((1u << MAX_RTS) - 1);

   assert(state->rp->color_attachment_count <= MAX_RTS);

   uint32_t color_mask = 0;
   for (uint32_t i = 0; i < state->rp->color_attachment_count; i++) {
      if (state->rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
         if (state->cal) {
            if (state->cal->color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
               color_mask |= BITFIELD_BIT(state->cal->color_map[i]);
         } else {
            color_mask |= BITFIELD_BIT(i);
         }
      }
   }

   return color_mask;
}

static void
populate_fs_prog_key(struct brw_fs_prog_key *key,
                     const struct vk_physical_device *device,
                     const struct vk_pipeline_robustness_state *rs,
                     const struct vk_graphics_pipeline_state *state,
                     VkShaderStageFlags link_stages)
{
   const struct anv_physical_device *pdevice =
      container_of(device, const struct anv_physical_device, vk);

   populate_base_gfx_prog_key(&key->base, device, rs, state, link_stages);

   /* Consider all inputs as valid until look at the NIR variables. */
   key->nr_color_regions = util_last_bit(rp_color_mask(state));

   /* To reduce possible shader recompilations we would need to know if
    * there is a SampleMask output variable to compute if we should emit
    * code to workaround the issue that hardware disables alpha to coverage
    * when there is SampleMask output.
    *
    * If the pipeline we compile the fragment shader in includes the output
    * interface, then we can be sure whether alpha_coverage is enabled or not.
    * If we don't have that output interface, then we have to compile the
    * shader with some conditionals.
    */
   if (state != NULL && state->ms != NULL) {
      /* VUID-VkGraphicsPipelineCreateInfo-rasterizerDiscardEnable-00751:
       *
       *   "If the pipeline is being created with fragment shader state,
       *    pMultisampleState must be a valid pointer to a valid
       *    VkPipelineMultisampleStateCreateInfo structure"
       *
       * It's also required for the fragment output interface.
       */
      key->multisample_fbo =
         BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
         INTEL_SOMETIMES :
         state->ms->rasterization_samples > 1 ? INTEL_ALWAYS : INTEL_NEVER;
      key->persample_interp =
         BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
         INTEL_SOMETIMES :
         (state->ms->sample_shading_enable &&
          (state->ms->min_sample_shading * state->ms->rasterization_samples) > 1) ?
         INTEL_ALWAYS : INTEL_NEVER;
      key->alpha_to_coverage =
         BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ?
         INTEL_SOMETIMES :
         (state->ms->alpha_to_coverage_enable ? INTEL_ALWAYS : INTEL_NEVER);

      /* TODO: We should make this dynamic */
      if (pdevice->instance->sample_mask_out_opengl_behaviour)
         key->ignore_sample_mask_out = !key->multisample_fbo;
   } else {
      /* Consider all inputs as valid until we look at the NIR variables. */
      key->nr_color_regions = MAX_RTS;

      key->alpha_to_coverage = INTEL_SOMETIMES;
      key->multisample_fbo = INTEL_SOMETIMES;
      key->persample_interp = INTEL_SOMETIMES;
   }

   if (pdevice->info.verx10 >= 200) {
      if (state != NULL && state->rs != NULL) {
         key->provoking_vertex_last =
            BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX) ?
            INTEL_SOMETIMES :
            state->rs->provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ?
            INTEL_ALWAYS : INTEL_NEVER;
      } else {
         key->provoking_vertex_last = INTEL_SOMETIMES;
      }
   } else {
      /* Pre-Xe2 we don't care about this at all, make sure it's always set to
       * NEVER to avoid it influencing the push constant.
       */
      key->provoking_vertex_last = INTEL_NEVER;
   }

   if (state != NULL && state->rs != NULL) {
      key->conservative_raster =
         BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ?
         INTEL_SOMETIMES :
         state->rs->conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT ?
         INTEL_NEVER : INTEL_ALWAYS;
   } else {
      key->conservative_raster = INTEL_SOMETIMES;
   }

   key->mesh_input =
      (link_stages & VK_SHADER_STAGE_VERTEX_BIT) ? INTEL_NEVER :
      (link_stages & VK_SHADER_STAGE_MESH_BIT_EXT) ? INTEL_ALWAYS :
      pdevice->info.has_mesh_shading ? INTEL_SOMETIMES : INTEL_NEVER;

   if (state && state->ms) {
      key->min_sample_shading = state->ms->min_sample_shading;
      key->api_sample_shading = state->ms->sample_shading_enable;
   }

   key->coarse_pixel = pipeline_has_coarse_pixel(state);
}

static void
populate_cs_prog_key(struct brw_cs_prog_key *key,
                     const struct vk_physical_device *device,
                     const struct vk_pipeline_robustness_state *rs)
{
   populate_base_prog_key(&key->base, device, rs);
}

static void
populate_bs_prog_key(struct brw_bs_prog_key *key,
                     const struct vk_physical_device *device,
                     const struct vk_pipeline_robustness_state *rs,
                     VkPipelineCreateFlags2KHR flags)
{
   populate_base_prog_key(&key->base, device, rs);

   uint32_t ray_flags = 0;
   const bool rt_skip_triangles =
      flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR;
   const bool rt_skip_aabbs =
      flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_AABBS_BIT_KHR;
   assert(!(rt_skip_triangles && rt_skip_aabbs));
   if (rt_skip_triangles)
      ray_flags |= BRW_RT_RAY_FLAG_SKIP_TRIANGLES;
   else if (rt_skip_aabbs)
      ray_flags |= BRW_RT_RAY_FLAG_SKIP_AABBS;

   key->pipeline_ray_flags = ray_flags;
}

static void
anv_shader_hash_state(struct vk_physical_device *device,
                      const struct vk_graphics_pipeline_state *state,
                      const struct vk_features *enabled_features,
                      VkShaderStageFlags stages,
                      blake3_hash blake3_out)
{
   struct mesa_blake3 blake3_ctx;
   _mesa_blake3_init(&blake3_ctx);

   anv_foreach_vk_stage(stage, stages) {
      union brw_any_prog_key key;
      memset(&key, 0, sizeof(key));

      switch (stage) {
      case VK_SHADER_STAGE_VERTEX_BIT:
         populate_vs_prog_key(&key.vs, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.vs, sizeof(key.vs));
         break;
      case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
         populate_tcs_prog_key(&key.tcs, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.tcs, sizeof(key.tcs));
         break;
      case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
         populate_tes_prog_key(&key.tes, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.tes, sizeof(key.tes));
         break;
      case VK_SHADER_STAGE_GEOMETRY_BIT:
         populate_gs_prog_key(&key.gs, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.gs, sizeof(key.gs));
         break;
      case VK_SHADER_STAGE_TASK_BIT_EXT:
         populate_task_prog_key(&key.task, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.task, sizeof(key.task));
         break;
      case VK_SHADER_STAGE_MESH_BIT_EXT:
         populate_mesh_prog_key(&key.mesh, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.mesh, sizeof(key.mesh));
         break;
      case VK_SHADER_STAGE_FRAGMENT_BIT:
         populate_fs_prog_key(&key.fs, device, NULL, state, stages);
         _mesa_blake3_update(&blake3_ctx, &key.fs, sizeof(key.fs));
         break;
      case VK_SHADER_STAGE_COMPUTE_BIT:
         populate_cs_prog_key(&key.cs, device, NULL);
         _mesa_blake3_update(&blake3_ctx, &key.cs, sizeof(key.cs));
         break;
      default:
         UNREACHABLE("Invalid stage");
      }
   }

   _mesa_blake3_final(&blake3_ctx, blake3_out);
}

static void
merge_tess_info(struct shader_info *tes_info,
                const struct shader_info *tcs_info)
{
   /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
    *
    *    "PointMode. Controls generation of points rather than triangles
    *     or lines. This functionality defaults to disabled, and is
    *     enabled if either shader stage includes the execution mode.
    *
    * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
    * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
    * and OutputVertices, it says:
    *
    *    "One mode must be set in at least one of the tessellation
    *     shader stages."
    *
    * So, the fields can be set in either the TCS or TES, but they must
    * agree if set in both.  Our backend looks at TES, so bitwise-or in
    * the values from the TCS.
    */
   assert(tcs_info->tess.tcs_vertices_out == 0 ||
          tes_info->tess.tcs_vertices_out == 0 ||
          tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
   tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;

   assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
          tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
          tcs_info->tess.spacing == tes_info->tess.spacing);
   tes_info->tess.spacing |= tcs_info->tess.spacing;

   assert(tcs_info->tess._primitive_mode == 0 ||
          tes_info->tess._primitive_mode == 0 ||
          tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode);
   tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode;
   tes_info->tess.ccw |= tcs_info->tess.ccw;
   tes_info->tess.point_mode |= tcs_info->tess.point_mode;
}

static void
anv_shader_link_tcs(const struct brw_compiler *compiler,
                    struct brw_tcs_prog_key *key,
                    struct vk_shader_compile_info *tcs_stage,
                    struct vk_shader_compile_info *tes_stage)
{
   assert(tes_stage && tes_stage->stage == MESA_SHADER_TESS_EVAL);

   brw_nir_link_shaders(compiler, tcs_stage->nir, tes_stage->nir);

   nir_lower_patch_vertices(tes_stage->nir,
                            tcs_stage->nir->info.tess.tcs_vertices_out,
                            NULL);

   /* Copy TCS info into the TES info */
   merge_tess_info(&tes_stage->nir->info, &tcs_stage->nir->info);

   /* Whacking the key after cache lookup is a bit sketchy, but all of
    * this comes from the SPIR-V, which is part of the hash used for the
    * pipeline cache.  So it should be safe.
    */
   key->_tes_primitive_mode = tes_stage->nir->info.tess._primitive_mode;
}

static void
anv_shader_link(const struct brw_compiler *compiler,
                struct vk_shader_compile_info *prev_stage,
                struct vk_shader_compile_info *next_stage)
{
   brw_nir_link_shaders(compiler, prev_stage->nir, next_stage->nir);
}

static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void *_stage, uint32_t set,
                        uint32_t binding, uint32_t array_index)
{
   const struct vk_shader_compile_info *stage = _stage;

   if (set == VK_NIR_YCBCR_SET_IMMUTABLE_SAMPLERS) {
      assert(binding < stage->embedded_sampler_count);
      return &stage->embedded_samplers[binding].ycbcr_conversion;
   } else {
      assert(set < MAX_SETS);
      const struct anv_descriptor_set_layout *set_layout =
         container_of(stage->set_layouts[set],
                      struct anv_descriptor_set_layout, vk);

      assert(binding < set_layout->binding_count);
      const struct anv_descriptor_set_binding_layout *bind_layout =
         &set_layout->binding[binding];

      if (bind_layout->samplers == NULL)
         return NULL;

      array_index = MIN2(array_index, bind_layout->array_size - 1);

      const struct anv_descriptor_set_layout_sampler *sampler =
         &bind_layout->samplers[array_index];

      return sampler->has_ycbcr_conversion ?
             &sampler->ycbcr_conversion_state : NULL;
   }
}

static void
anv_fixup_subgroup_size(struct anv_device *device, nir_shader *shader)
{
   struct shader_info *info = &shader->info;
   const struct anv_instance *instance = device->physical->instance;

   if (!mesa_shader_stage_uses_workgroup(info->stage))
      return;

   unsigned local_size = info->workgroup_size[0] *
                         info->workgroup_size[1] *
                         info->workgroup_size[2];

   /* Games don't always request full subgroups when they should,
    * which can cause bugs, as they may expect bigger size of the
    * subgroup than we choose for the execution.
    */
   if (instance->assume_full_subgroups &&
       info->uses_wide_subgroup_intrinsics &&
       info->api_subgroup_size == BRW_SUBGROUP_SIZE &&
       local_size &&
       local_size % BRW_SUBGROUP_SIZE == 0) {
      info->max_subgroup_size = BRW_SUBGROUP_SIZE;
      info->min_subgroup_size = BRW_SUBGROUP_SIZE;
   }

   if (instance->assume_full_subgroups_with_barrier &&
       info->stage == MESA_SHADER_COMPUTE &&
       device->info->verx10 <= 125 &&
       info->uses_control_barrier &&
       info->min_subgroup_size != info->max_subgroup_size &&
       local_size &&
       local_size % BRW_SUBGROUP_SIZE == 0) {
      info->max_subgroup_size = BRW_SUBGROUP_SIZE;
      info->min_subgroup_size = BRW_SUBGROUP_SIZE;
   }

   /* Similarly, sometimes games rely on the implicit synchronization of
    * the shared memory accesses, and choosing smaller subgroups than the game
    * expects will cause bugs. */
   if (instance->assume_full_subgroups_with_shared_memory &&
       info->shared_size > 0 &&
       info->min_subgroup_size != info->max_subgroup_size &&
       local_size &&
       local_size % BRW_SUBGROUP_SIZE == 0) {
      info->max_subgroup_size = BRW_SUBGROUP_SIZE;
      info->min_subgroup_size = BRW_SUBGROUP_SIZE;
   }

   /* Cooperative matrix extension requires that all invocations in a subgroup
    * be active. As a result, when the application does not request a specific
    * subgroup size, we must use SIMD32.
    */
   if (info->stage == MESA_SHADER_COMPUTE && info->cs.has_cooperative_matrix &&
       info->max_subgroup_size > info->min_subgroup_size) {
      info->api_subgroup_size = info->max_subgroup_size;
      info->min_subgroup_size = info->max_subgroup_size;
   }

   /* Only promote to SIMD32 if the max allows it. */
   if (info->max_subgroup_size >= BRW_SUBGROUP_SIZE &&
       info->min_subgroup_size != info->max_subgroup_size &&
       info->uses_wide_subgroup_intrinsics &&
       nir_shader_intrinsics_pass(shader,
                                  detect_simd32_requirement,
                                  nir_metadata_all,
                                  NULL)) {
      info->max_subgroup_size = BRW_SUBGROUP_SIZE;
      info->min_subgroup_size = BRW_SUBGROUP_SIZE;
   }
}

static void
anv_shader_compile_vs(struct anv_device *device,
                      void *mem_ctx,
                      struct anv_shader_data *shader_data,
                      char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;
   nir_shader *nir = shader_data->info->nir;

   shader_data->num_stats = 1;

   struct brw_compile_vs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.vs,
      .prog_data = &shader_data->prog_data.vs,
   };

   if (intel_use_jay(devinfo, nir->info.stage)) {
      struct jay_shader_bin *bin =
         jay_compile(devinfo, mem_ctx, nir,
                     (union brw_any_prog_data *) params.prog_data,
                     (union brw_any_prog_key *) params.key);

      shader_data->code = (void *) bin->kernel;
   } else {
      shader_data->code = (void *) brw_compile_vs(compiler, &params);
   }

   *error_str = params.base.error_str;
}

static void
anv_shader_compile_tcs(struct anv_device *device,
                       void *mem_ctx,
                       struct anv_shader_data *shader_data,
                       char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = shader_data->info->nir;

   shader_data->key.tcs.outputs_written = nir->info.outputs_written;
   shader_data->key.tcs.patch_outputs_written = nir->info.patch_outputs_written;

   shader_data->num_stats = 1;

   struct brw_compile_tcs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.tcs,
      .prog_data = &shader_data->prog_data.tcs,
   };

   shader_data->code = (void *)brw_compile_tcs(compiler, &params);
   *error_str = params.base.error_str;
}

static void
anv_shader_compile_tes(struct anv_device *device,
                       void *mem_ctx,
                       struct anv_shader_data *tes_shader_data,
                       struct anv_shader_data *tcs_shader_data,
                       char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = tes_shader_data->info->nir;

   if (tcs_shader_data) {
      tes_shader_data->key.tes.inputs_read =
         tcs_shader_data->info->nir->info.outputs_written;
      tes_shader_data->key.tes.patch_inputs_read =
         tcs_shader_data->info->nir->info.patch_outputs_written;
   }

   tes_shader_data->num_stats = 1;

   struct brw_compile_tes_params params = {
      .base = {
         .nir = nir,
         .stats = tes_shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = tes_shader_data->source_hash,
         .archiver = tes_shader_data->archiver,
      },
      .key = &tes_shader_data->key.tes,
      .prog_data = &tes_shader_data->prog_data.tes,
      .input_vue_map = tcs_shader_data ?
                       &tcs_shader_data->prog_data.tcs.base.vue_map : NULL,
   };

   tes_shader_data->code = (void *)brw_compile_tes(compiler, &params);
   *error_str = params.base.error_str;
}

static void
anv_shader_compile_gs(struct anv_device *device,
                      void *mem_ctx,
                      struct anv_shader_data *shader_data,
                      char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = shader_data->info->nir;

   shader_data->num_stats = 1;

   struct brw_compile_gs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.gs,
      .prog_data = &shader_data->prog_data.gs,
   };

   shader_data->code = (void *)brw_compile_gs(compiler, &params);
   *error_str = params.base.error_str;
}

static void
anv_shader_compile_task(struct anv_device *device,
                        void *mem_ctx,
                        struct anv_shader_data *shader_data,
                        char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = shader_data->info->nir;

   shader_data->num_stats = 1;

   struct brw_compile_task_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.task,
      .prog_data = &shader_data->prog_data.task,
   };

   shader_data->code = (void *)brw_compile_task(compiler, &params);
   *error_str = params.base.error_str;
}

static nir_def *
wa_18019110168_load_provoking_vertex(nir_builder *b, void *data)
{
   const struct anv_pipeline_bind_map *bind_map = data;
   nir_def *val = NULL;

   for (uint32_t i = 0; i < bind_map->inline_dwords_count; i++) {
      if (bind_map->inline_dwords[i] == anv_drv_const_dword(gfx.wa_18019110168)) {
         val = nir_load_inline_data_intel(
            b, 1, 32, nir_imm_int(b, 0),
            .base = i * 4);
         break;
      }
   }

   if (val == NULL) {
      val = nir_load_push_data_intel(b, 1, 32, nir_imm_int(b, 0),
                                     .base = anv_drv_const_offset(gfx.wa_18019110168) -
                                             bind_map->push_ranges[0].start * 32,
                                     .range = anv_drv_const_size(gfx.wa_18019110168));
   }

   return nir_iand_imm(b, val, ANV_WA_18019110168_PROVOKING_VERTEX_MASK);
}

static nir_def *
wa_18019110168_load_per_primitive_remap_table(nir_builder *b, void *data)
{
   const struct anv_pipeline_bind_map *bind_map = data;
   nir_def *val = NULL;

   val = nir_load_push_data_intel(b, 1, 32, nir_imm_int(b, 0),
                                  .base = anv_drv_const_offset(gfx.wa_18019110168) -
                                          bind_map->push_ranges[0].start * 32,
                                  .range = anv_drv_const_size(gfx.wa_18019110168));

   return nir_iand_imm(b, val, ANV_WA_18019110168_PER_PRIMITIVE_REMAP_TABLE_OFFSET_MASK);
}

static void
anv_shader_compile_mesh(struct anv_device *device,
                        void *mem_ctx,
                        struct anv_shader_data *mesh_shader_data,
                        struct anv_shader_data *task_shader_data,
                        char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = mesh_shader_data->info->nir;

   mesh_shader_data->num_stats = 1;

   struct brw_compile_mesh_params params = {
      .base = {
         .nir = nir,
         .stats = mesh_shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = mesh_shader_data->source_hash,
         .archiver = mesh_shader_data->archiver,
      },
      .key = &mesh_shader_data->key.mesh,
      .prog_data = &mesh_shader_data->prog_data.mesh,
      .tue_map = task_shader_data ?
                 &task_shader_data->prog_data.task.map :
                 NULL,
      .wa_18019110168_load_provoking_vertex =
         wa_18019110168_load_provoking_vertex,
      .wa_18019110168_data = (void *)&mesh_shader_data->bind_map,
   };

   mesh_shader_data->code = (void *)brw_compile_mesh(compiler, &params);
   *error_str = params.base.error_str;
}

static void
anv_shader_compile_fs(struct anv_device *device,
                      void *mem_ctx,
                      struct anv_shader_data *shader_data,
                      const struct vk_graphics_pipeline_state *state,
                      char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;
   nir_shader *nir = shader_data->info->nir;

   /* When using Primitive Replication for multiview, each view gets its own
    * position slot.
    */
   uint32_t pos_slots = shader_data->use_primitive_replication ?
      MAX2(1, util_bitcount(shader_data->key.base.view_mask)) : 1;

   struct intel_vue_map prev_vue_map;
   brw_compute_vue_map(compiler->devinfo,
                       &prev_vue_map,
                       nir->info.inputs_read,
                       nir->info.separate_shader,
                       pos_slots);

   struct brw_compile_fs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.fs,
      .prog_data = &shader_data->prog_data.fs,
      .mue_map = shader_data->mue_map,

      .allow_spilling = true,
      .max_polygons = UCHAR_MAX,

      .wa_18019110168_load_per_primitive_remap_table_offset =
         wa_18019110168_load_per_primitive_remap_table,
      .wa_18019110168_data = (void *)&shader_data->bind_map,
   };

   if (intel_use_jay(devinfo, nir->info.stage)) {
      struct jay_shader_bin *bin =
         jay_compile(devinfo, mem_ctx, nir,
                     (union brw_any_prog_data *) params.prog_data,
                     (union brw_any_prog_key *) params.key);

      shader_data->code = (void *) bin->kernel;
   } else {
      shader_data->code = (void *) brw_compile_fs(compiler, &params);
   }

   *error_str = params.base.error_str;

   shader_data->num_stats = (uint32_t)!!shader_data->prog_data.fs.dispatch_multi +
                            (uint32_t)shader_data->prog_data.fs.dispatch_8 +
                            (uint32_t)shader_data->prog_data.fs.dispatch_16 +
                            (uint32_t)shader_data->prog_data.fs.dispatch_32;
   assert(shader_data->num_stats <= ARRAY_SIZE(shader_data->stats));

   /* Update the push constant padding range now that we know the amount of
    * per-primitive data delivered in the payload.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(shader_data->bind_map.push_ranges); i++) {
      if (shader_data->bind_map.push_ranges[i].set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING) {
         shader_data->bind_map.push_ranges[i].length = MAX2(
            shader_data->prog_data.fs.num_per_primitive_inputs / 2,
            shader_data->bind_map.push_ranges[i].length);
         break;
      }
   }
}

static void
anv_shader_compile_cs(struct anv_device *device,
                      void *mem_ctx,
                      struct anv_shader_data *shader_data,
                      char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;
   nir_shader *nir = shader_data->info->nir;

   shader_data->num_stats = 1;

   struct brw_compile_cs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.cs,
      .prog_data = &shader_data->prog_data.cs,
   };

   if (intel_use_jay(devinfo, nir->info.stage)) {
      struct jay_shader_bin *bin = jay_compile(devinfo, mem_ctx, nir,
                             (union brw_any_prog_data*)params.prog_data,
                             (union brw_any_prog_key*)params.key);

       shader_data->code = (void*)bin->kernel;
       shader_data->stats[0] = bin->stats;

       params.prog_data->local_size[0] = nir->info.workgroup_size[0];
       params.prog_data->local_size[1] = nir->info.workgroup_size[1];
       params.prog_data->local_size[2] = nir->info.workgroup_size[2];
   } else {
       shader_data->code = (void*)brw_compile_cs(compiler, &params);
   }

   *error_str = params.base.error_str;
}

static bool
should_remat_cb(nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel;
}

static void
anv_shader_compile_bs(struct anv_device *device,
                      void *mem_ctx,
                      struct anv_shader_data *shader_data,
                      char **error_str)
{
   const struct brw_compiler *compiler = device->physical->compiler;
   nir_shader *nir = shader_data->info->nir;
   const struct intel_device_info *devinfo = compiler->devinfo;

   struct brw_nir_lower_shader_calls_state lowering_state = {
      .devinfo = devinfo,
      .key = &shader_data->key.bs,
   };

   nir_shader **resume_shaders = NULL;
   uint32_t num_resume_shaders = 0;
   if (nir->info.stage != MESA_SHADER_COMPUTE) {
      struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
         .devinfo = devinfo,
      };
      const nir_lower_shader_calls_options opts = {
         .address_format = nir_address_format_64bit_global,
         .stack_alignment = BRW_BTD_STACK_ALIGN,
         .localized_loads = true,
         .vectorizer_callback = brw_nir_should_vectorize_mem,
         .vectorizer_data = &vectorize_cb_data,
         .should_remat_callback = should_remat_cb,
      };

      NIR_PASS(_, nir, brw_nir_lower_rt_intrinsics_pre_trace);
      NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
               &resume_shaders, &num_resume_shaders, mem_ctx);
      NIR_PASS(_, nir, brw_nir_lower_shader_calls, &lowering_state);
      NIR_PASS(_, nir, brw_nir_lower_rt_intrinsics,
               &shader_data->key.base, devinfo);
   }

   for (unsigned i = 0; i < num_resume_shaders; i++) {
      NIR_PASS(_, resume_shaders[i], brw_nir_lower_shader_calls,
               &lowering_state);
      NIR_PASS(_, resume_shaders[i], brw_nir_lower_rt_intrinsics,
               &shader_data->key.base, devinfo);
   }

   shader_data->num_stats = 1;

   struct brw_compile_bs_params params = {
      .base = {
         .nir = nir,
         .stats = shader_data->stats,
         .log_data = device,
         .mem_ctx = mem_ctx,
         .source_hash = shader_data->source_hash,
         .archiver = shader_data->archiver,
      },
      .key = &shader_data->key.bs,
      .prog_data = &shader_data->prog_data.bs,
      .num_resume_shaders = num_resume_shaders,
      .resume_shaders = resume_shaders,
   };

   shader_data->code = (void *)brw_compile_bs(compiler, &params);
   *error_str = params.base.error_str;
}

static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length,
   *align = comp_size * (length == 3 ? 4 : length);
}

static void
anv_shader_compute_fragment_rts(const struct intel_device_info *devinfo,
                                const struct vk_graphics_pipeline_state *state,
                                struct anv_shader_data *shader_data)
{
   assert(shader_data->bind_map.surface_count == 0);

   nir_shader *nir = shader_data->info->nir;
   const uint64_t rt_mask =
      (nir->info.outputs_written &
       ~BITFIELD_BIT(FRAG_RESULT_DUAL_SRC_BLEND)) >> FRAG_RESULT_DATA0;
   const unsigned num_rts = util_last_bit64(rt_mask);
   struct anv_pipeline_binding rt_bindings[MAX_RTS];

   shader_data->key.fs.nr_color_regions =
      util_last_bit(rt_mask & rp_color_mask(state));

   if (num_rts > 0) {
      for (unsigned rt = 0; rt < num_rts; rt++) {
         if (nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_DATA0 + rt)) {
            rt_bindings[rt] = (struct anv_pipeline_binding) {
               .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
               .index = rt,
               .binding = UINT32_MAX,
            };
         } else {
            /* Setup a null render target */
            rt_bindings[rt] = (struct anv_pipeline_binding) {
               .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
               .index = ANV_COLOR_OUTPUT_UNUSED,
               .binding = UINT32_MAX,
            };
         }
      }
      shader_data->bind_map.surface_count = num_rts;
   } else if (brw_nir_fs_needs_null_rt(devinfo, nir,
                 shader_data->key.fs.alpha_to_coverage != INTEL_NEVER)) {
      /* Setup a null render target */
      rt_bindings[0] = (struct anv_pipeline_binding) {
         .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
         .index = ANV_COLOR_OUTPUT_DISABLED,
         .binding = UINT32_MAX,
      };
      shader_data->bind_map.surface_count = 1;
   }

   typed_memcpy(shader_data->bind_map.surface_to_descriptor,
                rt_bindings, shader_data->bind_map.surface_count);
}

static bool
accept_64bit_atomic_cb(const nir_intrinsic_instr *intrin, const void *data)
{
   return (intrin->intrinsic == nir_intrinsic_image_atomic ||
           intrin->intrinsic == nir_intrinsic_image_atomic_swap ||
           intrin->intrinsic == nir_intrinsic_image_heap_atomic ||
           intrin->intrinsic == nir_intrinsic_image_heap_atomic_swap ||
           intrin->intrinsic == nir_intrinsic_image_deref_atomic ||
           intrin->intrinsic == nir_intrinsic_image_deref_atomic_swap) &&
          intrin->def.bit_size == 64;
}

static bool
lower_non_tg4_non_uniform_offsets(const nir_tex_instr *tex,
                                  unsigned index, void *data)
{
   /* HW cannot deal with divergent surfaces/samplers */
   if (tex->src[index].src_type == nir_tex_src_texture_offset ||
       tex->src[index].src_type == nir_tex_src_texture_handle ||
       tex->src[index].src_type == nir_tex_src_texture_heap_offset ||
       tex->src[index].src_type == nir_tex_src_sampler_offset ||
       tex->src[index].src_type == nir_tex_src_sampler_handle ||
       tex->src[index].src_type == nir_tex_src_sampler_heap_offset)
      return true;

   if (tex->src[index].src_type == nir_tex_src_offset) {
      /* HW can deal with TG4 divergent offsets only */
      return tex->op != nir_texop_tg4;
   }

   return false;
}

static void
fixup_large_workgroup_image_coherency(nir_shader *nir)
{
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            if (intr->intrinsic != nir_intrinsic_image_deref_store ||
                nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_3D)
               continue;

            /* We have found image store access to 3D. */
            nir_deref_instr *array_deref = nir_src_as_deref(intr->src[0]);
            if (array_deref->deref_type != nir_deref_type_array)
               continue;

            nir_alu_instr *alu = nir_src_as_alu(intr->src[1]);
            if (!alu || !nir_op_is_vec(alu->op))
               return;

            /* Check if any src is from @load_local_invocation_id. */
            for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
               nir_instr *parent = nir_def_instr(alu->src[i].src.ssa);
               if (parent->type != nir_instr_type_intrinsic)
                  continue;

               nir_intrinsic_instr *parent_intr = nir_instr_as_intrinsic(parent);
               if (parent_intr->intrinsic !=
                   nir_intrinsic_load_local_invocation_id)
                  continue;

               /* Found a match, change image access qualifier coherent. */
               nir_deref_instr *parent_deref =
                  nir_src_as_deref(array_deref->parent);
               parent_deref->var->data.access = ACCESS_COHERENT;
               return;
            }
         }  /* instr */
      } /* block */
   } /* func */
}

static void
cleanup_nir(nir_shader *nir)
{
   /* First run copy-prop to get rid of all of the vec() that address
    * calculations often create and then constant-fold so that, when we get to
    * anv_nir_lower_ubo_loads, we can detect constant offsets.
    */
   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_copy_prop);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_dce);
   } while (progress);
}

static void
anv_shader_lower_nir(struct anv_device *device,
                     void *mem_ctx,
                     const struct vk_graphics_pipeline_state *state,
                     struct anv_shader_data *shader_data)
{
   const struct anv_physical_device *pdevice = device->physical;
   const struct brw_compiler *compiler = pdevice->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;
   struct anv_descriptor_set_layout * const *set_layouts =
      (struct anv_descriptor_set_layout * const *) shader_data->info->set_layouts;
   const uint32_t set_layout_count = shader_data->info->set_layout_count;

   nir_shader *nir = shader_data->info->nir;

   /* Workaround for apps that need fp64 support */
   if (device->fp64_nir) {
      nir_shader *fp64_nir = anv_ensure_fp64_shader(device);

      NIR_PASS(_, nir, nir_lower_doubles, fp64_nir,
               nir->options->lower_doubles_options);

      bool fp_conv = false;
      NIR_PASS(fp_conv, nir, nir_lower_int64_float_conversions);
      if (fp_conv) {
         NIR_PASS(_, nir, nir_opt_algebraic);
         NIR_PASS(_, nir, nir_lower_doubles, fp64_nir,
                  nir->options->lower_doubles_options);
      }
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE &&
       pdevice->instance->large_workgroup_non_coherent_image_workaround) {
      const unsigned local_size = nir->info.workgroup_size[0] *
                                  nir->info.workgroup_size[1] *
                                  nir->info.workgroup_size[2];
      if (local_size == 64)
         fixup_large_workgroup_image_coherency(nir);
   }

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_wpos_center);
      NIR_PASS(_, nir, nir_lower_input_attachments,
               &(nir_input_attachment_options) { });
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE &&
       (shader_data->info->flags & VK_SHADER_CREATE_UNALIGNED_DISPATCH_BIT_MESA)) {
      NIR_PASS(_, nir, anv_nir_lower_unaligned_dispatch);
      /* anv_nir_lower_unaligned_dispatch pass uses nir_jump_return that we
       * need to lower it.
       */
      NIR_PASS(_, nir, nir_lower_returns);
      /* Lower load_base_workgroup_id inserted by unaligned_dispatch */
      nir_lower_compute_system_values_options lower_csv_options = {
         .has_base_workgroup_id = true,
      };
      NIR_PASS(_, nir, nir_lower_compute_system_values, &lower_csv_options);
   }

   if (mesa_shader_stage_is_mesh(nir->info.stage)) {
      nir_lower_compute_system_values_options options = {
         .lower_workgroup_id_to_index = true,
         /* nir_lower_idiv generates expensive code */
         .shortcut_1d_workgroup_id = devinfo->verx10 >= 125,
      };

      NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
   }

   NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, shader_data->info);

   if (nir->info.stage <= MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, anv_nir_lower_multiview,
               shader_data->key.base.view_mask,
               shader_data->use_primitive_replication);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE &&
       nir->info.cs.has_cooperative_matrix) {
      anv_fixup_subgroup_size(device, nir);
      NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.api_subgroup_size);

      /* Lowering of nir_instr_type_cmat_call will produce new
       * nir_instr_type_call instructions that need to be inlined.
       */
      bool inlined = false;
      NIR_PASS(_, nir, nir_opt_dce);
      NIR_PASS(inlined, nir, nir_inline_functions);
      nir_remove_non_entrypoints(nir);

      if (inlined) {
         /* Some shader_temp vars may have remained multi-function before
          * cmat lowering/inlining.  Now that everything was inlined,
          * they may be lowered to locals.
          */
         bool lowered_globals = false;
         NIR_PASS(lowered_globals, nir, nir_lower_global_vars_to_local);
         if (lowered_globals)
            NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
         NIR_PASS(_, nir, nir_opt_copy_prop_vars);
         NIR_PASS(_, nir, nir_opt_copy_prop);
      }
      NIR_PASS(_, nir, nir_opt_deref);
      NIR_PASS(_, nir, nir_opt_dce);

      NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
               nir_var_function_temp, 16);
   }

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   NIR_PASS(_, nir, nir_lower_memory_model);

   /* Apply lowering for 64bit atomics pre-Xe2 */
   const bool lower_64bit_atomics = devinfo->ver < 20;

   if (lower_64bit_atomics) {
      /* Ensure robustness, do this before brw_nir_lower_storage_image so that
       * added image size intrinsics for bounds checkings are properly lowered
       * for cube images.
       */
      NIR_PASS(_, nir, nir_lower_robust_access,
               accept_64bit_atomic_cb, NULL);
   }

   NIR_PASS(_, nir, brw_nir_lower_storage_image, compiler,
            &(struct brw_nir_lower_storage_image_opts) {
               /* Anv only supports Gfx9+ which has better defined typed read
                * behavior. It allows us to only have to care about lowering
                * loads.
                */
               .lower_loads = true,
               .lower_stores_64bit = true,
               .lower_loads_without_formats =
                  pdevice->instance->emulate_read_without_format,
            });

   if (lower_64bit_atomics) {
      /* Switch from image to global */
      NIR_PASS(_, nir, nir_lower_image_atomics_to_global,
               accept_64bit_atomic_cb, NULL);

      /* Detile for global */
      NIR_PASS(_, nir, brw_nir_lower_texel_address, devinfo,
               pdevice->isl_dev.shader_tiling);
   }

   /* Lower push constants variables prior to global realignment for CBV
    * resources, it makes identifying a 64bit pointer from the push constants
    * easier.
    */
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
            nir_address_format_32bit_offset);

   /* Realign pointers to CBV on stages that can promote to push buffers. */
   if (pdevice->instance->promote_cbv_to_push_buffers &&
       nir->info.stage <= MESA_SHADER_FRAGMENT) {
      /* Cleanup for the analysis, we don't want any ALU */
      cleanup_nir(nir);
      NIR_PASS(_, nir, anv_nir_realign_cbv);
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
            nir_address_format_64bit_global);

   NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info);

   shader_data->push_desc_info.used_descriptors =
      anv_nir_compute_used_push_descriptors(
         nir, set_layouts, set_layout_count);

   /* Need to have render targets placed first in the bind_map */
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      anv_shader_compute_fragment_rts(devinfo, state, shader_data);

   uint32_t dynamic_descriptors_offset = 0;
   uint32_t dynamic_descriptors_offsets[MAX_SETS] = {};
   for (uint32_t i = 0; i < set_layout_count; i++) {
      dynamic_descriptors_offsets[i] = dynamic_descriptors_offset;
      if (set_layouts[i] != NULL) {
         shader_data->bind_map.binding_mask |= ANV_PIPELINE_BIND_MASK_SET(i);
         const uint32_t dyn_desc_count =
            set_layouts[i]->vk.dynamic_descriptor_count;
         shader_data->bind_map.dynamic_descriptors[i] = dyn_desc_count;
         dynamic_descriptors_offset += dyn_desc_count;
      }
   }

   /* Apply the actual layout to UBOs, SSBOs, and textures */
   if (shader_data->info->flags & VK_SHADER_CREATE_DESCRIPTOR_HEAP_BIT_EXT) {
      NIR_PASS(_, nir, anv_nir_lower_descriptor_heap, device,
               shader_data->info->embedded_sampler_count,
               shader_data->info->embedded_samplers,
               &shader_data->bind_map);
   } else {
      NIR_PASS(_, nir, anv_nir_apply_pipeline_layout,
               pdevice, shader_data->key.base.robust_flags,
               set_layouts, set_layout_count,
               (shader_data->info->flags &
                VK_SHADER_CREATE_INDEPENDENT_SETS_BIT_KHR) ? NULL :
               dynamic_descriptors_offsets,
               shader_data->info->flags & VK_SHADER_CREATE_INDIRECT_BINDABLE_BIT_EXT,
               &shader_data->bind_map, &shader_data->push_map, mem_ctx);
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
            anv_nir_ubo_addr_format(pdevice, shader_data->key.base.robust_flags));
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
            anv_nir_ssbo_addr_format(pdevice, shader_data->key.base.robust_flags));


   cleanup_nir(nir);

   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   cleanup_nir(nir);

   /* Required for nir_divergence_analysis() which is needed for
    * anv_nir_lower_ubo_loads.
    */
   NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
   nir_divergence_analysis(nir);

   NIR_PASS(_, nir, anv_nir_lower_ubo_loads);

   NIR_PASS(_, nir, nir_opt_remove_phis);

   const bool lower_non_uniform_texture_offsets = device->info->ver < 20;

   const enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
      nir_lower_non_uniform_texture_access |
      nir_lower_non_uniform_texture_query |
      nir_lower_non_uniform_image_access |
      nir_lower_non_uniform_image_query |
      nir_lower_non_uniform_get_ssbo_size |
      (lower_non_uniform_texture_offsets ?
       nir_lower_non_uniform_texture_offset_access : 0);

   /* Pre-Xe2 platforms don't have native support for dynamic programmable
    * offsets. Since support includes non-uniform programmable offsets, we
    * need to lower those texture messages in the same way we lower
    * non-uniform texture/sampler handles.
    */
   if (lower_non_uniform_texture_offsets)
      nir_divergence_analysis(nir);

   /* In practice, most shaders do not have non-uniform-qualified
    * accesses (see
    * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069)
    * thus a cheaper and likely to fail check is run first.
    */
   if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
      NIR_PASS(_, nir, nir_opt_non_uniform_access);

      /* We don't support non-uniform UBOs and non-uniform SSBO access is
      * handled naturally by falling back to A64 messages.
      */
      NIR_PASS(_, nir, nir_lower_non_uniform_access,
               &(nir_lower_non_uniform_access_options) {
                  .types = lower_non_uniform_access_types,
                  .tex_src_callback = lower_non_tg4_non_uniform_offsets,
                  .callback = NULL,
               });

      NIR_PASS(_, nir, intel_nir_lower_non_uniform_resource_intel);
      NIR_PASS(_, nir, intel_nir_cleanup_resource_intel);
      NIR_PASS(_, nir, nir_opt_dce);
   }

   if (mesa_shader_stage_uses_workgroup(nir->info.stage)) {
      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
               nir_var_mem_shared, shared_type_info);

      NIR_PASS(_, nir, nir_lower_explicit_io,
               nir_var_mem_shared, nir_address_format_32bit_offset);

      if (nir->info.zero_initialize_shared_memory &&
          nir->info.shared_size > 0) {
         /* The effective Shared Local Memory size is at least 1024 bytes and
          * is always rounded to a power of two, so it is OK to align the size
          * used by the shader to chunk_size -- which does simplify the logic.
          */
         const unsigned chunk_size = 16;
         const unsigned shared_size = align(nir->info.shared_size, chunk_size);
         assert(shared_size <=
                intel_compute_slm_calculate_size(devinfo->ver,
                                                 nir->info.shared_size));

         NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
                  shared_size, chunk_size);
      }
   }

   if (mesa_shader_stage_is_compute(nir->info.stage)) {
      NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, devinfo,
               &shader_data->prog_data.cs);
   }

   NIR_PASS(_, nir, anv_nir_lower_driver_values, pdevice);

   NIR_PASS(_, nir, anv_nir_update_resource_intel_block);

   NIR_PASS(_, nir, anv_nir_shrink_push_constant_ranges);

   NIR_PASS(_, nir, anv_nir_compute_push_layout,
               pdevice, shader_data->key.base.robust_flags,
               &(struct anv_nir_push_layout_info) {
                  .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL &&
                                            shader_data->key.tcs.separate_tess_vue_layout) ||
                                           (nir->info.stage == MESA_SHADER_TESS_EVAL &&
                                            shader_data->key.tes.separate_tess_vue_layout),
                  .fragment_dynamic      = nir->info.stage == MESA_SHADER_FRAGMENT &&
                                           brw_fs_prog_key_is_dynamic(&shader_data->key.fs),
                  .mesh_dynamic          = nir->info.stage == MESA_SHADER_FRAGMENT &&
                                           shader_data->key.fs.mesh_input == INTEL_SOMETIMES,
               },
               &shader_data->key.base,
               &shader_data->prog_data.base,
               &shader_data->bind_map, &shader_data->push_map);

   if (!(shader_data->info->flags & VK_SHADER_CREATE_DESCRIPTOR_HEAP_BIT_EXT)) {
      NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice,
                  shader_data->bind_map.layout_type);
   }

   shader_data->push_desc_info.push_set_buffer =
      anv_nir_loads_push_desc_buffer(
         nir, set_layouts, set_layout_count, &shader_data->bind_map);
   shader_data->push_desc_info.fully_promoted_ubo_descriptors =
      anv_nir_push_desc_ubo_fully_promoted(
         nir, set_layouts, set_layout_count, &shader_data->bind_map);

   /* Only detected clearing compute shaders, these are the only problematic
    * cases we're aware of.
    */
   if (nir->info.stage == MESA_SHADER_COMPUTE)
      shader_data->bind_map.inferred_behavior = anv_nir_clear_shader_analysis(nir);
}

static uint32_t
sets_layout_embedded_sampler_count(const struct vk_shader_compile_info *info)
{
   uint32_t count = 0;

   for (uint32_t s = 0; s < info->set_layout_count; s++) {
      if (info->set_layouts[s] == NULL)
         continue;

      const struct anv_descriptor_set_layout *layout =
         (const struct anv_descriptor_set_layout *) info->set_layouts[s];
      count += layout->embedded_sampler_count;
   }

   return count;
}

static void
anv_shaders_pre_lower_gfx(struct anv_device *device,
                          struct anv_shader_data *shaders_data,
                          uint32_t shader_count,
                          const struct vk_graphics_pipeline_state *state,
                          void *mem_ctx)
{
   const struct intel_device_info *devinfo = device->info;
   const struct brw_compiler *compiler = device->physical->compiler;

   /* Walk backwards to link */
   struct anv_shader_data *next_stage = NULL;
   for (int s = shader_count - 1; s >= 0; s--) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      struct vk_shader_compile_info *info = shader_data->info;

      if (next_stage == NULL) {
         next_stage = shader_data;
         continue;
      }

      switch (info->stage) {
      case MESA_SHADER_VERTEX:
      case MESA_SHADER_TESS_EVAL:
      case MESA_SHADER_TASK:
      case MESA_SHADER_GEOMETRY:
         anv_shader_link(compiler, info, next_stage->info);
         break;
      case MESA_SHADER_TESS_CTRL:
         anv_shader_link_tcs(compiler,
                             &shader_data->key.tcs,
                             info, next_stage->info);
         break;
      case MESA_SHADER_MESH:
         anv_shader_link(compiler, info, next_stage->info);
         next_stage->mue_map = &shader_data->prog_data.mesh.map;
         break;
      default:
         UNREACHABLE("Invalid graphics shader stage");
      }

      next_stage = shader_data;
   }

   bool use_primitive_replication = false;
   if (devinfo->ver >= 12 && shaders_data[0].key.base.view_mask != 0) {
      /* For some pipelines HW Primitive Replication can be used instead of
       * instancing to implement Multiview.  This depend on how viewIndex is
       * used in all the active shaders, so this check can't be done per
       * individual shaders.
       */
      nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
      VkShaderStageFlags vk_stages = 0;
      for (unsigned s = 0; s < shader_count; s++) {
         struct anv_shader_data *shader_data = &shaders_data[s];
         shaders[shader_data->info->stage] = shader_data->info->nir;
         vk_stages |= mesa_to_vk_shader_stage(shader_data->info->stage);
      }

      use_primitive_replication =
         anv_check_for_primitive_replication(device, vk_stages, shaders,
                                             shaders_data[0].key.base.view_mask);
   }

   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      shader_data->use_primitive_replication = use_primitive_replication;
      shader_data->instance_multiplier =
         (shader_data->key.base.view_mask && !use_primitive_replication) ?
         util_bitcount(shader_data->key.base.view_mask) : 1;
   }
}

static void
anv_shaders_post_lower_gfx(struct anv_device *device,
                           struct anv_shader_data *shaders_data,
                           uint32_t shader_count,
                           const struct vk_graphics_pipeline_state *state)
{
   struct vk_shader_compile_info *prev_stage = NULL;
   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      struct vk_shader_compile_info *info = shader_data->info;

      struct shader_info *cur_info = &shader_data->info->nir->info;

      if (prev_stage && info->stage < MESA_SHADER_FRAGMENT) {
         struct shader_info *prev_info = &prev_stage->nir->info;

         prev_info->outputs_written |= cur_info->inputs_read &
                  ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
         cur_info->inputs_read |= prev_info->outputs_written &
                  ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
         prev_info->patch_outputs_written |= cur_info->patch_inputs_read;
         cur_info->patch_inputs_read |= prev_info->patch_outputs_written;
      }

      prev_stage = info;
   }
}

static void
anv_shaders_post_lower_rt(struct anv_device *device,
                          struct anv_shader_data *shaders_data,
                          uint32_t shader_count)
{
   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      nir_shader *nir = shader_data->info->nir;

      switch (nir->info.stage) {
      case MESA_SHADER_RAYGEN:
         brw_nir_lower_raygen(nir, device->info);
         break;

      case MESA_SHADER_ANY_HIT:
         brw_nir_lower_any_hit(nir, device->info);
         break;

      case MESA_SHADER_CLOSEST_HIT:
         brw_nir_lower_closest_hit(nir, device->info);
         break;

      case MESA_SHADER_MISS:
         brw_nir_lower_miss(nir, device->info);
         break;

      case MESA_SHADER_CALLABLE:
         brw_nir_lower_callable(nir, device->info);
         break;

      case MESA_SHADER_INTERSECTION:
         /* Nothing to do, we merge this into ANY_HIT */
         break;

      default:
         UNREACHABLE("invalid stage");
      }
   }
}

static VkShaderStageFlags
anv_shader_get_rt_group_linking(struct vk_physical_device *device,
                                VkShaderStageFlags stages)
{
   const VkShaderStageFlags any_hit_intersection =
      VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
      VK_SHADER_STAGE_INTERSECTION_BIT_KHR;

   return (stages & any_hit_intersection) == any_hit_intersection ?
          any_hit_intersection : 0;
}

static void
anv_debug_archiver_init(void *mem_ctx, struct anv_shader_data *shaders_data,
                        uint32_t shader_count)
{
   /* A hash is used to identify the per stage archive file.  Just using the
    * single stage hash/key is sufficient if it is not linked.  If shaders
    * are linked together, also include a combined hash of all stages to
    * distinguish from the not linked case.
    */
   unsigned char linked_hash[BLAKE3_KEY_LEN];
   if (shader_count > 1) {
      blake3_hasher ctx;
      _mesa_blake3_init(&ctx);

      for (uint32_t s = 0; s < shader_count; s++) {
         struct anv_shader_data *shader_data = &shaders_data[s];
         struct vk_shader_compile_info *info = shader_data->info;
         _mesa_blake3_update(&ctx, info->nir->info.source_blake3, BLAKE3_OUT_LEN);
         _mesa_blake3_update(&ctx, &shader_data->key, shader_data->key_size);
      }
      _mesa_blake3_final(&ctx, linked_hash);
   }

   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      struct vk_shader_compile_info *info = shader_data->info;

      char name[BLAKE3_HEX_LEN + 4] = {};
      {
         blake3_hasher ctx;
         unsigned char hash[BLAKE3_KEY_LEN];
         _mesa_blake3_init(&ctx);
         _mesa_blake3_update(&ctx, info->nir->info.source_blake3, BLAKE3_OUT_LEN);
         _mesa_blake3_update(&ctx, &shader_data->key, shader_data->key_size);
         if (shader_count > 1)
            _mesa_blake3_update(&ctx, linked_hash, BLAKE3_KEY_LEN);
         _mesa_blake3_final(&ctx, hash);

         _mesa_blake3_format(name, hash);
      }
      memcpy(&name[BLAKE3_HEX_LEN - 1], ".anv", 4);

      shader_data->archiver =
         debug_archiver_open(mem_ctx, name, PACKAGE_VERSION MESA_GIT_SHA1);

      if (shader_data->archiver) {
         debug_archiver_set_prefix(shader_data->archiver,
            _mesa_shader_stage_to_abbrev(info->stage));
      }
   }
}

static void
anv_debug_archiver_finish(struct anv_shader_data *shaders_data,
                          uint32_t shader_count)
{
   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      debug_archiver_close(shader_data->archiver);
      shader_data->archiver = NULL;
   }
}

static VkResult
anv_shader_compile(struct vk_device *vk_device,
                   uint32_t shader_count,
                   struct vk_shader_compile_info *infos,
                   const struct vk_graphics_pipeline_state *state,
                   const struct vk_features *enabled_features,
                   const VkAllocationCallbacks* pAllocator,
                   struct vk_shader **shaders_out)
{
   struct anv_device *device =
      container_of(vk_device, struct anv_device, vk);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < shader_count; i++)
      shaders_out[i] = NULL;

   void *mem_ctx = ralloc_context(NULL);

   struct anv_shader_data *shaders_data =
      rzalloc_array(mem_ctx, struct anv_shader_data, shader_count);
   assert(shader_count < MAX2(ANV_GRAPHICS_SHADER_STAGE_COUNT,
                              ANV_RT_SHADER_STAGE_COUNT));

   /* Order the stages (no guarantee from the runtime) */
   struct vk_shader_compile_info *ordered_infos[MESA_SHADER_KERNEL] = { 0 };
   struct vk_shader **ordered_shaders_out[MESA_SHADER_KERNEL] = { 0 };
   VkShaderStageFlags stages = 0;
   for (uint32_t s = 0; s < shader_count; s++) {
      ordered_infos[infos[s].stage] = &infos[s];
      ordered_shaders_out[infos[s].stage] = &shaders_out[s];

      /* The runtime transfers the ownership of the NIR to us, so we need to
       * free it after compile.
       */
      ralloc_steal(mem_ctx, infos[s].nir);
      stages |= mesa_to_vk_shader_stage(infos[s].stage);
   }

   {
#define ADD_SHADER(_name)                                               \
   do {                                                                 \
      if (ordered_infos[MESA_SHADER_##_name]) {                         \
         shaders_data[remapped_index].info =                            \
            ordered_infos[MESA_SHADER_##_name];                         \
         shaders_data[remapped_index].shader_out =                      \
            ordered_shaders_out[MESA_SHADER_##_name];                   \
         remapped_index++;                                              \
      }                                                                 \
   } while (0)

      uint32_t remapped_index = 0;
      ADD_SHADER(COMPUTE);
      ADD_SHADER(VERTEX);
      ADD_SHADER(TESS_CTRL);
      ADD_SHADER(TESS_EVAL);
      ADD_SHADER(GEOMETRY);
      ADD_SHADER(TASK);
      ADD_SHADER(MESH);
      ADD_SHADER(FRAGMENT);
      ADD_SHADER(RAYGEN);
      ADD_SHADER(CLOSEST_HIT);
      ADD_SHADER(INTERSECTION);
      ADD_SHADER(ANY_HIT);
      ADD_SHADER(MISS);
      ADD_SHADER(CALLABLE);

#undef ADD_SHADER
   }

   /* From now on, don't use infos[] anymore. */

   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      struct vk_shader_compile_info *info = shader_data->info;

      shader_data->source_hash = ((uint32_t*)info->nir->info.source_blake3)[0];

      for (uint32_t i = 0; i < info->set_layout_count; i++) {
         shader_data->dynamic_descriptors[i] =
            info->set_layouts[i] != NULL ?
            info->set_layouts[i]->dynamic_descriptor_count : 0;
      }

      shader_data->bind_map.layout_type =
         set_layouts_get_layout_type((struct anv_descriptor_set_layout * const *)info->set_layouts,
                                     info->set_layout_count);
      shader_data->bind_map.surface_to_descriptor =
         brw_shader_stage_requires_bindless_resources(info->stage) ? NULL :
         rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
      shader_data->bind_map.sampler_to_descriptor =
         brw_shader_stage_requires_bindless_resources(info->stage) ? NULL :
         rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
      shader_data->bind_map.embedded_sampler_to_binding =
         rzalloc_array(mem_ctx, struct anv_pipeline_embedded_sampler_binding,
                       MAX2(sets_layout_embedded_sampler_count(info),
                            info->embedded_sampler_count));

      shader_data->prog_data.base.stage = info->stage;

      switch (info->stage) {
      case MESA_SHADER_VERTEX:
         populate_vs_prog_key(&shader_data->key.vs, vk_device->physical,
                              info->robustness, state, stages);
         break;
      case MESA_SHADER_TESS_CTRL:
         populate_tcs_prog_key(&shader_data->key.tcs, vk_device->physical,
                               info->robustness, state, stages);
         break;
      case MESA_SHADER_TESS_EVAL:
         populate_tes_prog_key(&shader_data->key.tes, vk_device->physical,
                               info->robustness, state, stages);
         break;
      case MESA_SHADER_GEOMETRY:
         populate_gs_prog_key(&shader_data->key.gs, vk_device->physical,
                              info->robustness, state, stages);
         break;
      case MESA_SHADER_TASK:
         populate_task_prog_key(&shader_data->key.task, vk_device->physical,
                                info->robustness, state, stages);
         break;
      case MESA_SHADER_MESH:
         populate_mesh_prog_key(&shader_data->key.mesh, vk_device->physical,
                                info->robustness, state, stages);
         break;
      case MESA_SHADER_FRAGMENT:
         populate_fs_prog_key(&shader_data->key.fs, vk_device->physical,
                              info->robustness, state, stages);
         break;
      case MESA_SHADER_COMPUTE:
         populate_cs_prog_key(&shader_data->key.cs, vk_device->physical,
                              info->robustness);
         break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_ANY_HIT:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_CALLABLE:
         populate_bs_prog_key(&shader_data->key.bs, vk_device->physical,
                              info->robustness, info->rt_flags);
         break;
      default:
         UNREACHABLE("Invalid stage");
      }
   }

   if (INTEL_DEBUG(DEBUG_MDA))
      anv_debug_archiver_init(mem_ctx, shaders_data, shader_count);

   {
      /* We're going to do cross stage link if we have a fragment shader with
       * any other stage (that would include all the associated
       * pre-rasterization stages of the pipeline).
       */
      const bool separate_shaders =
         !(shader_count > 1 && ordered_infos[MESA_SHADER_FRAGMENT] != NULL);

      for (uint32_t s = 0; s < shader_count; s++)
         shaders_data[s].info->nir->info.separate_shader = separate_shaders;
   }

   if (mesa_shader_stage_is_graphics(shaders_data[0].info->stage)) {
      anv_shaders_pre_lower_gfx(device, shaders_data, shader_count,
                                state, mem_ctx);
   }

   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];

      anv_shader_lower_nir(device, mem_ctx, state, shader_data);

      anv_fixup_subgroup_size(device, shader_data->info->nir);

      anv_nir_apply_shader_workarounds(shader_data->info->nir);
   }

   /* Combine intersection & any-hit before lowering */
   if (ordered_infos[MESA_SHADER_INTERSECTION] != NULL) {
      brw_nir_lower_combined_intersection_any_hit(
         ordered_infos[MESA_SHADER_INTERSECTION]->nir,
         ordered_infos[MESA_SHADER_ANY_HIT] != NULL ?
         ordered_infos[MESA_SHADER_ANY_HIT]->nir : NULL,
         device->info);
   }

   if (mesa_shader_stage_is_graphics(shaders_data[0].info->stage))
      anv_shaders_post_lower_gfx(device, shaders_data, shader_count, state);
   else if (mesa_shader_stage_is_rt(shaders_data[0].info->stage))
      anv_shaders_post_lower_rt(device, shaders_data, shader_count);

   for (uint32_t s = 0; s < shader_count; s++) {
      struct anv_shader_data *shader_data = &shaders_data[s];
      struct anv_shader_data *prev_shader_data =
         s > 0 ? &shaders_data[s - 1] : NULL;

      char *error_str = NULL;
      switch (shader_data->info->stage) {
      case MESA_SHADER_VERTEX:
         anv_shader_compile_vs(device, mem_ctx, shader_data, &error_str);
         break;
      case MESA_SHADER_TESS_CTRL:
         anv_shader_compile_tcs(device, mem_ctx, shader_data, &error_str);
         break;
      case MESA_SHADER_TESS_EVAL:
         anv_shader_compile_tes(device, mem_ctx,
                                &shaders_data[s], prev_shader_data,
                                &error_str);
         break;
      case MESA_SHADER_GEOMETRY:
         anv_shader_compile_gs(device, mem_ctx, shader_data, &error_str);
         break;
      case MESA_SHADER_TASK:
         anv_shader_compile_task(device, mem_ctx, shader_data, &error_str);
         break;
      case MESA_SHADER_MESH:
         anv_shader_compile_mesh(device, mem_ctx,
                                 &shaders_data[s], prev_shader_data,
                                 &error_str);
         break;
      case MESA_SHADER_FRAGMENT:
         anv_shader_compile_fs(device, mem_ctx, shader_data, state, &error_str);
         break;
      case MESA_SHADER_COMPUTE:
         anv_shader_compile_cs(device, mem_ctx, shader_data, &error_str);
         break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_ANY_HIT:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_CALLABLE:
         anv_shader_compile_bs(device, mem_ctx, shader_data, &error_str);
         break;
      default:
         UNREACHABLE("Invalid graphics shader stage");
      }

      if (shader_data->code == NULL) {
         if (error_str)
            result = vk_errorf(device, VK_ERROR_UNKNOWN, "%s", error_str);
         else
            result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
         goto end;
      }

      anv_nir_validate_push_layout(device->physical,
                                   &shader_data->prog_data.base,
                                   &shader_data->bind_map);

      shader_data->xfb_info = shader_data->info->nir->xfb_info;

      result = anv_shader_create(device, shader_data->info->stage,
                                 mem_ctx, shader_data, pAllocator,
                                 shader_data->shader_out);
      if (result != VK_SUCCESS)
         goto end;
   }

end:
   if (INTEL_DEBUG(DEBUG_MDA))
      anv_debug_archiver_finish(shaders_data, shader_count);

   ralloc_free(mem_ctx);

#if 0
   /* TODO: Write the feedback index into the pipeline */
   for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
      anv_pipeline_account_shader(&pipeline->base, pipeline->shaders[s]);
   }
#endif

   if (result != VK_SUCCESS) {
      for (unsigned s = 0; s < shader_count; s++) {
         if (shaders_out[s] != NULL)
            vk_shader_free(vk_device, &vk_device->alloc, shaders_out[s]);
      }
   }

   return result;
}

struct vk_device_shader_ops anv_device_shader_ops = {
   .get_nir_options                = anv_shader_get_nir_options,
   .get_spirv_options              = anv_shader_get_spirv_options,
   .preprocess_nir                 = anv_shader_preprocess_nir,
   .get_rt_group_linking           = anv_shader_get_rt_group_linking,
   .hash_state                     = anv_shader_hash_state,
   .compile                        = anv_shader_compile,
   .deserialize                    = anv_shader_deserialize,
   .replay_rt_shader_group         = anv_replay_rt_shader_group,
   .write_rt_shader_group          = anv_write_rt_shader_group,
   .write_rt_shader_group_replay_handle = anv_write_rt_shader_group_replay_handle,
   .cmd_bind_shaders               = anv_cmd_buffer_bind_shaders,
   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
   .cmd_set_rt_state               = anv_cmd_buffer_set_rt_state,
   .cmd_set_stack_size             = anv_cmd_buffer_set_stack_size,
};