mesa/src/freedreno/vulkan/tu_pipeline.cc
Rob Clark b91a614baf tu: Rework emit_xs_config()
Rework it to take all active/enabled shader stages in one shot, to
simplify things and drop the xs_configs table.

This lets us use the variant reg packers directly to better deal with
register changes across generations.

Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39029>
2025-12-20 00:23:13 +00:00


/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*/
#include "tu_pipeline.h"
#include "common/freedreno_guardband.h"
#include "ir3/ir3_nir.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "nir/nir_serialize.h"
#include "spirv/nir_spirv.h"
#include "util/u_debug.h"
#include "util/mesa-sha1.h"
#include "util/shader_stats.h"
#include "vk_nir.h"
#include "vk_pipeline.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_knl.h"
#include "tu_formats.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_rmv.h"
/* Emit IB that preloads the descriptors that the shader uses */
static void
emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
enum a6xx_state_block sb, unsigned base, unsigned offset,
unsigned count)
{
/* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
* clear if emitting more packets will even help anything. Presumably the
* descriptor cache is relatively small, and these packets stop doing
* anything when there are too many descriptors.
*/
tu_cs_emit_pkt7(cs, opcode, 3);
tu_cs_emit(cs,
CP_LOAD_STATE6_0_STATE_TYPE(st) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
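/* With SS6_BINDLESS the "address" dwords are not a real pointer: the
* descriptor set index is packed into the high bits of the low dword
* (base << 28) and the remaining bits give the dword offset of the
* descriptors within that set's BINDLESS_BASE.
*/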
tu_cs_emit_qw(cs, offset | (base << 28));
}
static unsigned
tu6_load_state_size(struct tu_pipeline *pipeline,
struct tu_pipeline_layout *layout)
{
const unsigned load_state_size = 4;
unsigned size = 0;
for (unsigned i = 0; i < layout->num_sets; i++) {
if (!(pipeline->active_desc_sets & (1u << i)))
continue;
struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
for (unsigned j = 0; j < set_layout->binding_count; j++) {
struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
unsigned count = 0;
/* See comment in tu6_emit_load_state(). */
VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
unsigned stage_count = util_bitcount(stages);
if (!binding->array_size)
continue;
switch (binding->type) {
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
/* UAV-backed resources only need one packet for all graphics stages */
if (stage_count)
count += 1;
break;
case VK_DESCRIPTOR_TYPE_SAMPLER:
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
/* Textures and UBOs need a packet for each stage */
count = stage_count;
break;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
/* Because of how we pack combined images and samplers, we
* currently can't use one packet for the whole array.
*/
count = stage_count * binding->array_size * 2;
break;
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
break;
default:
UNREACHABLE("bad descriptor type");
}
size += count * load_state_size;
}
}
return size;
}
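/* Build the IB that prefetches descriptors. The packets emitted here have to
* match the counts computed in tu6_load_state_size() above, since the
* sub-stream is sized from that (each emit_load_state() packet is 4 dwords).
*/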
static void
tu6_emit_load_state(struct tu_device *device,
struct tu_pipeline *pipeline,
struct tu_pipeline_layout *layout)
{
unsigned size = tu6_load_state_size(pipeline, layout);
if (size == 0)
return;
struct tu_cs cs;
tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
for (unsigned i = 0; i < layout->num_sets; i++) {
/* From 13.2.7. Descriptor Set Binding:
*
* A compatible descriptor set must be bound for all set numbers that
* any shaders in a pipeline access, at the time that a draw or
* dispatch command is recorded to execute using that pipeline.
* However, if none of the shaders in a pipeline statically use any
* bindings with a particular set number, then no descriptor set need
* be bound for that set number, even if the pipeline layout includes
* a non-trivial descriptor set layout for that set number.
*
* This means that descriptor sets unused by the pipeline may have a
* garbage or 0 BINDLESS_BASE register, which will cause context faults
* when prefetching descriptors from these sets. Skip prefetching for
* descriptors from them to avoid this. This is also an optimization,
* since these prefetches would be useless.
*/
if (!(pipeline->active_desc_sets & (1u << i)))
continue;
struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
for (unsigned j = 0; j < set_layout->binding_count; j++) {
struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
unsigned base = i;
unsigned offset = binding->offset / 4;
/* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
* zink has descriptors for each stage in the push layout even if some
* stages aren't present in a used pipeline. We don't want to emit
* loads for unused descriptors.
*/
VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
unsigned count = binding->array_size;
/* If this is a variable-count descriptor, then the array_size is an
* upper bound on the size, but we don't know how many descriptors
* will actually be used. Therefore we can't pre-load them here.
*/
if (j == set_layout->binding_count - 1 &&
set_layout->has_variable_descriptors)
continue;
if (count == 0 || stages == 0)
continue;
switch (binding->type) {
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
assert(device->physical_device->reserved_set_idx >= 0);
base = device->physical_device->reserved_set_idx;
offset = (pipeline->program.dynamic_descriptor_offsets[i] +
binding->dynamic_offset_offset) / 4;
FALLTHROUGH;
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
/* UAV-backed resources only need one packet for all graphics stages */
if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_UAV,
base, offset, count * mul);
}
if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_UAV, SB6_CS_SHADER,
base, offset, count * mul);
}
break;
}
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
/* nothing - input attachments and inline uniforms don't use bindless */
break;
case VK_DESCRIPTOR_TYPE_SAMPLER:
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
tu_foreach_stage(stage, stages) {
emit_load_state(&cs, tu6_stage2opcode(stage),
binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
ST6_SHADER : ST6_CONSTANTS,
tu6_stage2texsb(stage), base, offset, count);
}
break;
}
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
assert(device->physical_device->reserved_set_idx >= 0);
base = device->physical_device->reserved_set_idx;
offset = (pipeline->program.dynamic_descriptor_offsets[i] +
binding->dynamic_offset_offset) / 4;
FALLTHROUGH;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
tu_foreach_stage(stage, stages) {
emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
tu6_stage2shadersb(stage), base, offset, count);
}
break;
}
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
tu_foreach_stage(stage, stages) {
/* TODO: We could emit less CP_LOAD_STATE6 if we used
* struct-of-arrays instead of array-of-structs.
*/
for (unsigned i = 0; i < count; i++) {
unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
emit_load_state(&cs, tu6_stage2opcode(stage),
ST6_CONSTANTS, tu6_stage2texsb(stage),
base, tex_offset, 1);
emit_load_state(&cs, tu6_stage2opcode(stage),
ST6_SHADER, tu6_stage2texsb(stage),
base, sam_offset, 1);
}
}
break;
}
default:
UNREACHABLE("bad descriptor type");
}
}
}
pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
}
struct tu_pipeline_builder
{
struct tu_device *device;
void *mem_ctx;
struct vk_pipeline_cache *cache;
const VkAllocationCallbacks *alloc;
const VkGraphicsPipelineCreateInfo *create_info;
VkPipelineCreateFlags2KHR create_flags;
struct tu_pipeline_layout layout;
struct tu_pvtmem_config pvtmem;
bool rasterizer_discard;
/* these states are affected by rasterizer_discard */
uint8_t unscaled_input_fragcoord;
/* Each library defines at least one piece of state in
* VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
* there can be at most as many libraries as pieces of state, of which
* there are currently 4.
*/
#define MAX_LIBRARIES 4
unsigned num_libraries;
struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
/* This is just the state that we are compiling now, whereas the final
* pipeline will include the state from the libraries.
*/
VkGraphicsPipelineLibraryFlagsEXT state;
/* The stages we are compiling now. */
VkShaderStageFlags active_stages;
bool fragment_density_map;
bool fdm_per_layer;
uint8_t max_fdm_layers;
struct vk_graphics_pipeline_all_state all_state;
struct vk_graphics_pipeline_state graphics_state;
};
static bool
tu_logic_op_reads_dst(VkLogicOp op)
{
switch (op) {
case VK_LOGIC_OP_CLEAR:
case VK_LOGIC_OP_COPY:
case VK_LOGIC_OP_COPY_INVERTED:
case VK_LOGIC_OP_SET:
return false;
default:
return true;
}
}
static bool
tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
{
for (unsigned i = 0; i < cb->attachment_count; i++) {
if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
return true;
}
return false;
}
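/* Decide how push constants are delivered to the shaders: if they fit in the
* shared constant file (and per-stage isn't being forced for debugging) they
* are loaded once as IR3_PUSH_CONSTS_SHARED; otherwise a7xx+ loads them in
* the shader preamble and older gens fall back to per-stage consts.
*/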
enum ir3_push_consts_type
tu_push_consts_type(const struct tu_pipeline_layout *layout,
const struct ir3_compiler *compiler)
{
if (!layout->push_constant_size)
return IR3_PUSH_CONSTS_NONE;
if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
return IR3_PUSH_CONSTS_PER_STAGE;
if (tu6_shared_constants_enable(layout, compiler)) {
return IR3_PUSH_CONSTS_SHARED;
} else {
if (compiler->gen >= 7) {
return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
} else {
return IR3_PUSH_CONSTS_PER_STAGE;
}
}
}
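/* The per-stage *_CONFIG registers share the same bit layout, so the VS
* packing macros are used to build the dword for every stage (see the
* .dword = sp_xs_config(...) uses below).
*/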
static uint32_t
sp_xs_config(const struct ir3_shader_variant *v)
{
if (!v)
return 0;
return A6XX_SP_VS_CONFIG_ENABLED |
COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_UAV) |
COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
A6XX_SP_VS_CONFIG_NUAV(ir3_shader_num_uavs(v)) |
A6XX_SP_VS_CONFIG_NTEX(v->num_samp) |
A6XX_SP_VS_CONFIG_NSAMP(v->num_samp);
}
static bool
push_shared_consts(const struct ir3_shader_variant *v)
{
return v && v->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE;
}
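/* Emit the CONST_CONFIG/CONFIG registers for all active stages in one shot.
* Typical graphics usage (see tu6_emit_program_config() below):
*
*    tu6_emit_xs_config<CHIP>(crb, { .vs = vs, .hs = hs, .ds = ds,
*                                    .gs = gs, .fs = fs });
*
* For compute pipelines only .cs is expected to be set, which takes the
* first branch.
*/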
template <chip CHIP>
void
tu6_emit_xs_config(struct tu_crb &crb, struct tu_shader_stages stages)
{
if (stages.cs) {
crb.add(SP_CS_CONST_CONFIG(CHIP,
.constlen = stages.cs->constlen,
.enabled = true,
.read_imm_shared_consts = push_shared_consts(stages.cs),
));
crb.add(A6XX_SP_CS_CONFIG(.dword = sp_xs_config(stages.cs)));
} else {
crb.add(SP_VS_CONST_CONFIG(CHIP,
.constlen = COND(stages.vs, stages.vs->constlen),
.enabled = stages.vs,
.read_imm_shared_consts = push_shared_consts(stages.vs),
));
crb.add(SP_HS_CONST_CONFIG(CHIP,
.constlen = COND(stages.hs, stages.hs->constlen),
.enabled = stages.hs,
.read_imm_shared_consts = push_shared_consts(stages.hs),
));
crb.add(SP_DS_CONST_CONFIG(CHIP,
.constlen = COND(stages.ds, stages.ds->constlen),
.enabled = stages.ds,
.read_imm_shared_consts = push_shared_consts(stages.ds),
));
crb.add(SP_GS_CONST_CONFIG(CHIP,
.constlen = COND(stages.gs, stages.gs->constlen),
.enabled = stages.gs,
.read_imm_shared_consts = push_shared_consts(stages.gs),
));
crb.add(SP_PS_CONST_CONFIG(CHIP,
.constlen = COND(stages.fs, stages.fs->constlen),
.enabled = stages.fs,
.read_imm_shared_consts = push_shared_consts(stages.fs),
));
crb.add(A6XX_SP_VS_CONFIG(.dword = sp_xs_config(stages.vs)));
crb.add(A6XX_SP_HS_CONFIG(.dword = sp_xs_config(stages.hs)));
crb.add(A6XX_SP_DS_CONFIG(.dword = sp_xs_config(stages.ds)));
crb.add(A6XX_SP_GS_CONFIG(.dword = sp_xs_config(stages.gs)));
crb.add(A6XX_SP_PS_CONFIG(.dword = sp_xs_config(stages.fs)));
}
}
TU_GENX(tu6_emit_xs_config);
static void
tu6_emit_dynamic_offset(struct tu_cs *cs,
const struct ir3_shader_variant *xs,
const struct tu_shader *shader,
const struct tu_program_state *program)
{
const struct tu_physical_device *phys_dev = cs->device->physical_device;
if (!xs)
return;
if (cs->device->physical_device->info->props.load_shader_consts_via_preamble) {
if (shader->const_state.dynamic_offsets_ubo.size == 0)
return;
uint32_t offsets[MAX_SETS];
for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
unsigned dynamic_offset_start =
program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
offsets[i] = dynamic_offset_start;
}
/* A7XX TODO: Emit data via sub_cs instead of NOP */
uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
} else {
if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
return;
tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
unsigned dynamic_offset_start =
program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
tu_cs_emit(cs, dynamic_offset_start);
}
}
}
template <chip CHIP>
void
tu6_emit_shared_consts_enable(struct tu_crb &crb, bool enable)
{
if (CHIP == A6XX) {
/* Enable/disable shared constants */
crb.add(HLSQ_SHARED_CONSTS(CHIP, .enable = enable));
} else {
assert(!enable);
}
crb.add(A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
.isammode = ISAMMODE_GL,
.shared_consts_enable = enable));
}
TU_GENX(tu6_emit_shared_consts_enable);
template <chip CHIP>
static void
tu6_setup_streamout(struct tu_cs *cs,
const struct ir3_shader_variant *v,
const struct ir3_shader_linkage *l)
{
const struct ir3_stream_output_info *info = &v->stream_output;
/* Note: 64 here comes from the HW layout of the program RAM. The program
* for stream N is at DWORD 64 * N.
*/
#define A6XX_SO_PROG_DWORDS 64
uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
bool has_pc_dgen_so_cntl = cs->device->physical_device->info->props.has_pc_dgen_so_cntl;
/* TODO: streamout state should be in a non-GMEM draw state */
/* no streamout: */
if (info->num_outputs == 0) {
tu_crb crb = cs->crb(3);
crb.add(VPC_SO_MAPPING_WPTR(CHIP, 0));
crb.add(VPC_SO_CNTL(CHIP, 0));
if (has_pc_dgen_so_cntl)
crb.add(PC_DGEN_SO_CNTL(CHIP, 0));
return;
}
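/* Each SO program dword describes two outputs: port A covers even output
* locations and port B odd ones, so the entry for location "loc" of stream S
* lands in prog[S * A6XX_SO_PROG_DWORDS + loc / 2].
*/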
for (unsigned i = 0; i < info->num_outputs; i++) {
const struct ir3_stream_output *out = &info->output[i];
unsigned k = out->register_index;
unsigned idx;
/* Skip it, if it's an output that was never assigned a register. */
if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
continue;
/* The linkage map is sorted in the order the fragment shader wants its
* inputs, so we have to search it for the matching slot here.
*/
for (idx = 0; idx < l->cnt; idx++)
if (l->var[idx].slot == v->outputs[k].slot)
break;
assert(idx < l->cnt);
for (unsigned j = 0; j < out->num_components; j++) {
unsigned c = j + out->start_component;
unsigned loc = l->var[idx].loc + c;
unsigned off = j + out->dst_offset; /* in dwords */
assert(loc < A6XX_SO_PROG_DWORDS * 2);
unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
if (loc & 1) {
prog[dword] |= A6XX_VPC_SO_MAPPING_PORT_B_EN |
A6XX_VPC_SO_MAPPING_PORT_B_BUF(out->output_buffer) |
A6XX_VPC_SO_MAPPING_PORT_B_OFF(off * 4);
} else {
prog[dword] |= A6XX_VPC_SO_MAPPING_PORT_A_EN |
A6XX_VPC_SO_MAPPING_PORT_A_BUF(out->output_buffer) |
A6XX_VPC_SO_MAPPING_PORT_A_OFF(off * 4);
}
BITSET_SET(valid_dwords, dword);
}
}
unsigned prog_count = 0;
unsigned start, end;
BITSET_FOREACH_RANGE(start, end, valid_dwords,
A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
prog_count += end - start + 1;
}
tu_crb crb = cs->crb(6 + prog_count);
crb.add(VPC_SO_CNTL(
CHIP,
.buf0_stream = info->stride[0] > 0 ? 1 + info->buffer_to_stream[0] : 0,
.buf1_stream = info->stride[1] > 0 ? 1 + info->buffer_to_stream[1] : 0,
.buf2_stream = info->stride[2] > 0 ? 1 + info->buffer_to_stream[2] : 0,
.buf3_stream = info->stride[3] > 0 ? 1 + info->buffer_to_stream[3] : 0,
.stream_enable = info->streams_written));
for (uint32_t i = 0; i < 4; i++) {
crb.add(VPC_SO_BUFFER_STRIDE(CHIP, i, info->stride[i]));
}
bool first = true;
BITSET_FOREACH_RANGE(start, end, valid_dwords,
A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
crb.add(VPC_SO_MAPPING_WPTR(CHIP, .addr = start, .reset = first));
for (unsigned i = start; i < end; i++) {
crb.add(VPC_SO_MAPPING_PORT(CHIP, .dword = prog[i]));
}
first = false;
}
if (has_pc_dgen_so_cntl) {
/* When present, setting this register makes sure that degenerate primitives
* are included in the stream output and not discarded.
*/
crb.add(PC_DGEN_SO_CNTL(CHIP, .stream_enable = info->streams_written));
}
}
enum tu_geom_consts_type
{
TU_CONSTS_PRIMITIVE_MAP,
TU_CONSTS_PRIMITIVE_PARAM,
};
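/* Upload a block of geometry/tessellation driver params, either directly
* into the constant file with CP_LOAD_STATE6 (clamped to the variant's
* constlen), or, on GPUs that load shader consts via the preamble, as an
* inline-emitted UBO that the preamble reads from.
*/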
static void
tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
const struct ir3_const_state *const_state,
unsigned constlen, enum a6xx_state_block block,
uint32_t offset, uint32_t size, const uint32_t *dwords)
{
assert(size % 4 == 0);
dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
if (!cs->device->physical_device->info->props.load_shader_consts_via_preamble) {
uint32_t base;
switch (type) {
case TU_CONSTS_PRIMITIVE_MAP:
base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
break;
case TU_CONSTS_PRIMITIVE_PARAM:
base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
break;
default:
UNREACHABLE("bad consts type");
}
int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
if (adjusted_size <= 0)
return;
tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(block) |
CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
tu_cs_emit_array(cs, dwords, adjusted_size);
} else {
uint32_t base;
switch (type) {
case TU_CONSTS_PRIMITIVE_MAP:
base = const_state->primitive_map_ubo.idx;
break;
case TU_CONSTS_PRIMITIVE_PARAM:
base = const_state->primitive_param_ubo.idx;
break;
default:
UNREACHABLE("bad consts type");
}
if (base == -1)
return;
/* A7XX TODO: Emit data via sub_cs instead of NOP */
uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
tu_cs_emit_pkt7(cs, opcode, 5);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(block) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
int size_vec4s = DIV_ROUND_UP(size, 4);
tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
}
}
static void
tu6_emit_link_map(struct tu_cs *cs,
const struct ir3_shader_variant *producer,
const struct ir3_shader_variant *consumer,
enum a6xx_state_block sb)
{
const struct ir3_const_state *const_state = ir3_const_state(consumer);
uint32_t size = align(consumer->input_size, 4);
if (size == 0)
return;
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
const_state, consumer->constlen, sb, 0, size, producer->output_loc);
}
static int
tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
const struct ir3_shader_variant *last_shader,
uint32_t index,
uint8_t *interp_mode,
uint8_t *ps_repl_mode)
{
const uint32_t compmask = fs->inputs[index].compmask;
/* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
* fourth component occupy three consecutive varying slots
*/
int shift = 0;
*interp_mode = 0;
*ps_repl_mode = 0;
if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
if (compmask & 0x1) {
*ps_repl_mode |= PS_REPL_S << shift;
shift += 2;
}
if (compmask & 0x2) {
*ps_repl_mode |= PS_REPL_T << shift;
shift += 2;
}
if (compmask & 0x4) {
*interp_mode |= INTERP_ZERO << shift;
shift += 2;
}
if (compmask & 0x8) {
*interp_mode |= INTERP_ONE << 6;
shift += 2;
}
} else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
/* If the last geometry shader doesn't statically write these, they're
* implicitly zero and the FS is supposed to read zero.
*/
const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
if (ir3_find_output(last_shader, slot) < 0 &&
(compmask & 0x1)) {
*interp_mode |= INTERP_ZERO;
} else {
*interp_mode |= INTERP_FLAT;
}
} else if (fs->inputs[index].flat) {
for (int i = 0; i < 4; i++) {
if (compmask & (1 << i)) {
*interp_mode |= INTERP_FLAT << shift;
shift += 2;
}
}
}
return util_bitcount(compmask) * 2;
}
template <chip CHIP>
static void
tu6_emit_vpc_varying_modes(struct tu_cs *cs,
const struct ir3_shader_variant *fs,
const struct ir3_shader_variant *last_shader)
{
uint32_t interp_modes[8] = { 0 };
uint32_t ps_repl_modes[8] = { 0 };
uint32_t interp_regs = 0;
if (fs) {
for (int i = -1;
(i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
/* get the mode for input i */
uint8_t interp_mode;
uint8_t ps_repl_mode;
const int bits =
tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
/* OR the mode into the array */
const uint32_t inloc = fs->inputs[i].inloc * 2;
uint32_t n = inloc / 32;
uint32_t shift = inloc % 32;
interp_modes[n] |= interp_mode << shift;
ps_repl_modes[n] |= ps_repl_mode << shift;
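/* If this input's 2-bit-per-component modes straddle a 32-bit register
* boundary, spill the high bits into the next register. E.g. a 4-component
* varying at inloc 14 starts at bit 28: bits [31:28] of register n get the
* low half and bits [3:0] of register n+1 get the rest.
*/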
if (shift + bits > 32) {
n++;
shift = 32 - shift;
interp_modes[n] |= interp_mode >> shift;
ps_repl_modes[n] |= ps_repl_mode >> shift;
}
interp_regs = MAX2(interp_regs, n + 1);
}
}
if (interp_regs) {
tu_cs_emit_pkt4(cs, VPC_VARYING_INTERP_MODE_MODE(CHIP, 0).reg, interp_regs);
tu_cs_emit_array(cs, interp_modes, interp_regs);
tu_cs_emit_pkt4(cs, VPC_VARYING_REPLACE_MODE_MODE(CHIP, 0).reg, interp_regs);
tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
}
}
template <chip CHIP>
void
tu6_emit_vpc(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *hs,
const struct ir3_shader_variant *ds,
const struct ir3_shader_variant *gs,
const struct ir3_shader_variant *fs)
{
/* note: doesn't compile as static because of the array regs.. */
const struct reg_config {
uint16_t reg_sp_xs_out_reg;
uint16_t reg_sp_xs_vpc_dst_reg;
uint16_t reg_vpc_xs_pack;
uint16_t reg_vpc_xs_clip_cntl;
uint16_t reg_vpc_xs_clip_cntl_v2;
uint16_t reg_gras_xs_cl_cntl;
uint16_t reg_pc_xs_out_cntl;
uint16_t reg_sp_xs_primitive_cntl;
uint16_t reg_vpc_xs_layer_cntl;
uint16_t reg_vpc_xs_layer_cntl_v2;
uint16_t reg_gras_xs_layer_cntl;
} reg_config[] = {
[MESA_SHADER_VERTEX] = {
REG_A6XX_SP_VS_OUTPUT_REG(0),
REG_A6XX_SP_VS_VPC_DEST_REG(0),
REG_A6XX_VPC_VS_CNTL,
REG_A6XX_VPC_VS_CLIP_CULL_CNTL,
REG_A6XX_VPC_VS_CLIP_CULL_CNTL_V2,
REG_A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE,
REG_A6XX_PC_VS_CNTL,
REG_A6XX_SP_VS_OUTPUT_CNTL,
REG_A6XX_VPC_VS_SIV_CNTL,
REG_A6XX_VPC_VS_SIV_CNTL_V2,
REG_A6XX_GRAS_SU_VS_SIV_CNTL,
},
[MESA_SHADER_TESS_CTRL] = {
0,
0,
0,
0,
0,
0,
REG_A6XX_PC_HS_CNTL,
0,
0,
0
},
[MESA_SHADER_TESS_EVAL] = {
REG_A6XX_SP_DS_OUTPUT_REG(0),
REG_A6XX_SP_DS_VPC_DEST_REG(0),
REG_A6XX_VPC_DS_CNTL,
REG_A6XX_VPC_DS_CLIP_CULL_CNTL,
REG_A6XX_VPC_DS_CLIP_CULL_CNTL_V2,
REG_A6XX_GRAS_CL_DS_CLIP_CULL_DISTANCE,
REG_A6XX_PC_DS_CNTL,
REG_A6XX_SP_DS_OUTPUT_CNTL,
REG_A6XX_VPC_DS_SIV_CNTL,
REG_A6XX_VPC_DS_SIV_CNTL_V2,
REG_A6XX_GRAS_SU_DS_SIV_CNTL,
},
[MESA_SHADER_GEOMETRY] = {
REG_A6XX_SP_GS_OUTPUT_REG(0),
REG_A6XX_SP_GS_VPC_DEST_REG(0),
REG_A6XX_VPC_GS_CNTL,
REG_A6XX_VPC_GS_CLIP_CULL_CNTL,
REG_A6XX_VPC_GS_CLIP_CULL_CNTL_V2,
REG_A6XX_GRAS_CL_GS_CLIP_CULL_DISTANCE,
REG_A6XX_PC_GS_CNTL,
REG_A6XX_SP_GS_OUTPUT_CNTL,
REG_A6XX_VPC_GS_SIV_CNTL,
REG_A6XX_VPC_GS_SIV_CNTL_V2,
REG_A6XX_GRAS_SU_GS_SIV_CNTL,
},
};
const struct ir3_shader_variant *last_shader;
if (gs) {
last_shader = gs;
} else if (hs) {
last_shader = ds;
} else {
last_shader = vs;
}
const struct reg_config *cfg = &reg_config[last_shader->type];
struct ir3_shader_linkage linkage = {
.primid_loc = 0xff,
.clip0_loc = 0xff,
.clip1_loc = 0xff,
};
if (fs)
ir3_link_shaders(&linkage, last_shader, fs, true);
if (last_shader->stream_output.num_outputs)
ir3_link_stream_out(&linkage, last_shader);
/* a6xx finds position/pointsize at the end */
const uint32_t pointsize_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
const uint32_t layer_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
const uint32_t view_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
const uint32_t clip0_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
const uint32_t clip1_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
uint32_t flags_regid = gs ?
ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
const uint32_t shading_rate_regid =
ir3_find_output_regid(last_shader, VARYING_SLOT_PRIMITIVE_SHADING_RATE);
uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
uint32_t shading_rate_loc = 0xff;
if (layer_regid != regid(63, 0)) {
layer_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
}
if (view_regid != regid(63, 0)) {
view_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
}
if (shading_rate_regid != regid(63, 0)) {
shading_rate_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_PRIMITIVE_SHADING_RATE,
shading_rate_regid, 0x1, linkage.max_loc);
}
unsigned extra_pos = 0;
for (unsigned i = 0; i < last_shader->outputs_count; i++) {
if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
continue;
if (position_loc == 0xff)
position_loc = linkage.max_loc;
ir3_link_add(&linkage, last_shader->outputs[i].slot,
last_shader->outputs[i].regid,
0xf, position_loc + 4 * last_shader->outputs[i].view);
extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
}
if (pointsize_regid != regid(63, 0)) {
pointsize_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
}
uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
/* Handle the case where clip/cull distances aren't read by the FS */
uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
clip0_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
clip_cull_mask & 0xf, linkage.max_loc);
}
if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
clip1_loc = linkage.max_loc;
ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
clip_cull_mask >> 4, linkage.max_loc);
}
tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
/* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
* an input primitive type with adjacency, an output primitive type of
* points, and a high enough vertex count causes a hang.
*/
if (cs->device->physical_device->info->props.gs_vpc_adjacency_quirk &&
gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
linkage.max_loc > 4) {
linkage.max_loc = MAX2(linkage.max_loc, 9);
}
/* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
* at least when a DS is the last stage, so add a dummy output to keep it
* happy if there aren't any. We do this late in order to avoid emitting
* any unused code and make sure that optimizations don't remove it.
*/
if (linkage.cnt == 0)
ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
/* map outputs of the last shader to VPC */
assert(linkage.cnt <= 32);
const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
uint32_t sp_out[16] = {0};
uint32_t sp_vpc_dst[8] = {0};
for (uint32_t i = 0; i < linkage.cnt; i++) {
((uint16_t *) sp_out)[i] =
A6XX_SP_VS_OUTPUT_REG_A_REGID(linkage.var[i].regid) |
A6XX_SP_VS_OUTPUT_REG_A_COMPMASK(linkage.var[i].compmask);
((uint8_t *) sp_vpc_dst)[i] =
A6XX_SP_VS_VPC_DEST_REG_OUTLOC0(linkage.var[i].loc);
}
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
tu_cs_emit_array(cs, sp_out, sp_out_count);
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
tu_cs_emit(cs, A6XX_VPC_VS_CNTL_POSITIONLOC(position_loc) |
A6XX_VPC_VS_CNTL_PSIZELOC(pointsize_loc) |
A6XX_VPC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
A6XX_VPC_VS_CNTL_EXTRAPOS(extra_pos));
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) |
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc));
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) |
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc));
tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
tu_cs_emit(cs, A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CLIP_MASK(last_shader->clip_mask) |
A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CULL_MASK(last_shader->cull_mask));
const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
const struct ir3_shader_variant *shader = geom_shaders[i];
if (!shader)
continue;
bool primid = shader->type != MESA_SHADER_VERTEX &&
VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
if (shader == last_shader) {
tu_cs_emit(cs, A6XX_PC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
CONDREG(pointsize_regid, A6XX_PC_VS_CNTL_PSIZE) |
CONDREG(layer_regid, A6XX_PC_VS_CNTL_LAYER) |
CONDREG(view_regid, A6XX_PC_VS_CNTL_VIEW) |
COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID) |
A6XX_PC_VS_CNTL_CLIP_MASK(clip_cull_mask) |
CONDREG(shading_rate_regid, A6XX_PC_VS_CNTL_SHADINGRATE));
} else {
tu_cs_emit(cs, COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID));
}
}
/* if vertex_flags somehow gets optimized out, you're gonna have a bad time: */
if (gs)
assert(flags_regid != INVALID_REG);
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
tu_cs_emit(cs, A6XX_SP_VS_OUTPUT_CNTL_OUT(linkage.cnt) |
A6XX_SP_GS_OUTPUT_CNTL_FLAGS_REGID(flags_regid));
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
tu_cs_emit(cs, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) |
A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) |
A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(shading_rate_loc));
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
tu_cs_emit(cs, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) |
A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) |
A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(shading_rate_loc));
tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_LAYER) |
CONDREG(view_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_VIEW));
tu6_emit_vpc_varying_modes<CHIP>(cs, fs, last_shader);
}
TU_GENX(tu6_emit_vpc);
static void
tu6_emit_vs_params(struct tu_cs *cs,
const struct ir3_const_state *const_state,
unsigned constlen,
unsigned param_stride,
unsigned num_vertices)
{
uint32_t vs_params[4] = {
param_stride * num_vertices * 4, /* vs primitive stride */
param_stride * 4, /* vs vertex stride */
0,
0,
};
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
const_state, constlen, SB6_VS_SHADER, 0,
ARRAY_SIZE(vs_params), vs_params);
}
static void
tu_get_tess_iova(struct tu_device *dev,
uint64_t *tess_factor_iova,
uint64_t *tess_param_iova)
{
/* Create the shared tess factor BO the first time tess is used on the
* device. The pointer is re-checked under dev->mutex so that concurrent
* first uses only allocate it once.
*/
if (!dev->tess_bo) {
mtx_lock(&dev->mutex);
if (!dev->tess_bo) {
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
}
mtx_unlock(&dev->mutex);
}
*tess_factor_iova = dev->tess_bo->iova;
*tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
}
static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
};
#define HS_PARAMS_SIZE 8
template <chip CHIP>
static unsigned
tu6_patch_control_points_size(struct tu_device *dev,
const struct tu_shader *vs,
const struct tu_shader *tcs,
const struct tu_shader *tes,
const struct tu_program_state *program,
uint32_t patch_control_points)
{
if (dev->physical_device->info->props.load_shader_consts_via_preamble) {
#define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
return EMIT_CONST_DWORDS(4) +
EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
#undef EMIT_CONST_DWORDS
} else {
#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
return EMIT_CONST_DWORDS(4) +
EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
#undef EMIT_CONST_DWORDS
}
}
template <chip CHIP>
void
tu6_emit_patch_control_points(struct tu_cs *cs,
const struct tu_shader *vs,
const struct tu_shader *tcs,
const struct tu_shader *tes,
const struct tu_program_state *program,
uint32_t patch_control_points)
{
if (!tcs->variant)
return;
struct tu_device *dev = cs->device;
tu6_emit_vs_params(cs,
&program->link[MESA_SHADER_VERTEX].const_state,
program->link[MESA_SHADER_VERTEX].constlen,
vs->variant->output_size,
patch_control_points);
uint64_t tess_factor_iova, tess_param_iova;
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
uint32_t hs_params[HS_PARAMS_SIZE] = {
vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
vs->variant->output_size * 4, /* hs vertex stride */
tcs->variant->output_size,
patch_control_points,
tess_param_iova,
tess_param_iova >> 32,
tess_factor_iova,
tess_factor_iova >> 32,
};
const struct ir3_const_state *hs_const =
&program->link[MESA_SHADER_TESS_CTRL].const_state;
unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
hs_const, hs_constlen, SB6_HS_SHADER, 0,
ARRAY_SIZE(hs_params), hs_params);
uint32_t patch_local_mem_size_16b =
patch_control_points * vs->variant->output_size / 4;
/* Total attribute slots in HS incoming patch. */
tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_PARAM_1, 1);
tu_cs_emit(cs, patch_local_mem_size_16b);
const uint32_t wavesize = 64;
const uint32_t vs_hs_local_mem_size = 16384;
uint32_t max_patches_per_wave;
if (dev->physical_device->info->props.tess_use_shared) {
/* HS invocations for a patch are always within the same wave,
* making barriers less expensive. VS can't have barriers so we
* don't care about VS invocations being in the same wave.
*/
max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
} else {
/* VS is also in the same wave */
max_patches_per_wave =
wavesize / MAX2(patch_control_points,
tcs->variant->tess.tcs_vertices_out);
}
uint32_t patches_per_wave =
MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
max_patches_per_wave);
uint32_t wave_input_size = DIV_ROUND_UP(
patches_per_wave * patch_local_mem_size_16b * 16, 256);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CNTL_1, 1);
tu_cs_emit(cs, wave_input_size);
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
/* convert from # of patches to draw count */
subdraw_size *= patch_control_points;
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
tu_cs_emit(cs, subdraw_size);
}
static void
tu6_emit_geom_tess_consts(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *hs,
const struct ir3_shader_variant *ds,
const struct ir3_shader_variant *gs)
{
struct tu_device *dev = cs->device;
if (gs && !hs) {
tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
vs->output_size, gs->gs.vertices_in);
}
if (hs) {
uint64_t tess_factor_iova, tess_param_iova;
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
uint32_t ds_params[8] = {
gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
ds->output_size * 4, /* ds vertex stride */
hs->output_size, /* hs vertex stride (dwords) */
hs->tess.tcs_vertices_out,
tess_param_iova,
tess_param_iova >> 32,
tess_factor_iova,
tess_factor_iova >> 32,
};
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
ARRAY_SIZE(ds_params), ds_params);
}
if (gs) {
const struct ir3_shader_variant *prev = ds ? ds : vs;
uint32_t gs_params[4] = {
prev->output_size * gs->gs.vertices_in * 4, /* gs primitive stride */
prev->output_size * 4, /* gs vertex stride */
0,
0,
};
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
ARRAY_SIZE(gs_params), gs_params);
}
}
template <chip CHIP>
static void
tu6_emit_program_config(struct tu_cs *cs,
const struct tu_program_state *prog,
struct tu_shader **shaders,
const struct ir3_shader_variant **variants)
{
STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
tu_crb crb = cs->crb(0);
bool shared_consts_enable =
prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
tu6_emit_shared_consts_enable<CHIP>(crb, shared_consts_enable);
crb.add(SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true,
.ds_state = true, .gs_state = true,
.fs_state = true, .gfx_uav = true,
.gfx_shared_const = shared_consts_enable));
const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
tu6_emit_xs_config<CHIP>(crb, { .vs = vs, .hs = hs, .ds = ds, .gs = gs, .fs = fs });
crb.flush();
for (size_t stage_idx = MESA_SHADER_VERTEX;
stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
mesa_shader_stage stage = (mesa_shader_stage) stage_idx;
tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
}
if (hs) {
tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
}
if (gs) {
if (hs) {
tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
} else {
tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
}
uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
if (CHIP == A6XX) {
/* Size of per-primitive allocation in ldlw memory in vec4s. */
uint32_t vec4_size = gs->gs.vertices_in *
DIV_ROUND_UP(prev_stage_output_size, 4);
tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
}
uint32_t prim_size = prev_stage_output_size;
if (prim_size > 64)
prim_size = 64;
else if (prim_size == 64)
prim_size = 63;
tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CNTL_1, 1);
tu_cs_emit(cs, prim_size);
}
if (gs || hs) {
tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
}
}
static bool
contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
{
return (state &
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
}
static bool
pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
{
return pipeline->type == TU_PIPELINE_GRAPHICS ||
pipeline->type == TU_PIPELINE_COMPUTE ||
contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
}
/* Return true if this pipeline contains all of the GPL stages listed but none
* of the libraries it uses do, so this is "the first time" that all of them
* are defined together. This is useful for state that needs to be combined
* from multiple GPL stages.
*/
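/* For example, when a pre-rasterization library and a fragment-shader
* library are linked into a final pipeline, the final pipeline is the first
* object to contain both, so state that depends on both (such as the
* descriptor preload IB sized in tu_pipeline_allocate_cs() below) is emitted
* there.
*/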
static bool
set_combined_state(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline,
VkGraphicsPipelineLibraryFlagsEXT state)
{
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
(tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
return false;
for (unsigned i = 0; i < builder->num_libraries; i++) {
if ((builder->libraries[i]->state & state) == state)
return false;
}
return true;
}
#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
static VkResult
tu_pipeline_allocate_cs(struct tu_device *dev,
struct tu_pipeline *pipeline,
struct tu_pipeline_layout *layout,
struct tu_pipeline_builder *builder,
const struct ir3_shader_variant *compute)
{
uint32_t size = 1024;
/* graphics case: */
if (builder) {
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
}
if (set_combined_state(builder, pipeline,
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
size += tu6_load_state_size(pipeline, layout);
}
} else {
size += tu6_load_state_size(pipeline, layout);
}
/* Allocate the space for the pipeline out of the device's RO suballocator.
*
* Sub-allocating BOs saves memory and also kernel overhead in refcounting of
* BOs at exec time.
*
* The pipeline cache would seem like a natural place to stick the
* suballocator, except that it is not guaranteed to outlive the pipelines
* created from it, so you can't store any long-lived state there, and you
* can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
* pipeline destroy isn't synchronized by the cache.
*/
mtx_lock(&dev->pipeline_mutex);
VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
size * 4, 128);
mtx_unlock(&dev->pipeline_mutex);
if (result != VK_SUCCESS)
return result;
TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
return VK_SUCCESS;
}
static void
tu_append_executable(struct tu_pipeline *pipeline,
const struct ir3_shader_variant *variant,
char *nir_from_spirv)
{
struct tu_pipeline_executable exe = {
.stage = variant->type,
.stats = variant->info,
.is_binning = variant->binning_pass,
.nir_from_spirv = nir_from_spirv,
.nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
.disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
};
util_dynarray_append(&pipeline->executables, exe);
}
static void
tu_hash_stage(struct mesa_sha1 *ctx,
VkPipelineCreateFlags2KHR pipeline_flags,
const VkPipelineShaderStageCreateInfo *stage,
const nir_shader *nir,
const struct tu_shader_key *key)
{
if (nir) {
struct blob blob;
blob_init(&blob);
nir_serialize(&blob, nir, true);
_mesa_sha1_update(ctx, blob.data, blob.size);
blob_finish(&blob);
} else {
unsigned char stage_hash[SHA1_DIGEST_LENGTH];
vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
_mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
}
_mesa_sha1_update(ctx, key, sizeof(*key));
}
static void
tu_hash_shaders(unsigned char *hash,
VkPipelineCreateFlags2KHR pipeline_flags,
const VkPipelineShaderStageCreateInfo **stages,
nir_shader *const *nir,
const struct tu_pipeline_layout *layout,
const struct tu_shader_key *keys,
VkGraphicsPipelineLibraryFlagsEXT state)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
if (layout)
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
if (stages[i] || nir[i]) {
tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
}
}
_mesa_sha1_update(&ctx, &state, sizeof(state));
enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
_mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
_mesa_sha1_final(&ctx, hash);
}
static void
tu_hash_compute(unsigned char *hash,
VkPipelineCreateFlags2KHR pipeline_flags,
const VkPipelineShaderStageCreateInfo *stage,
const struct tu_pipeline_layout *layout,
const struct tu_shader_key *key)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
if (layout)
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
_mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
_mesa_sha1_final(&ctx, hash);
}
static struct tu_shader *
tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
const void *key_data, size_t key_size,
bool *application_cache_hit)
{
struct vk_pipeline_cache_object *object =
vk_pipeline_cache_lookup_object(cache, key_data, key_size,
&tu_shader_ops, application_cache_hit);
if (object)
return container_of(object, struct tu_shader, base);
else
return NULL;
}
static struct tu_shader *
tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
struct tu_shader *shader)
{
struct vk_pipeline_cache_object *object =
vk_pipeline_cache_add_object(cache, &shader->base);
return container_of(object, struct tu_shader, base);
}
static bool
tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
struct blob *blob);
static struct vk_pipeline_cache_object *
tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
const void *key_data,
size_t key_size,
struct blob_reader *blob);
static void
tu_nir_shaders_destroy(struct vk_device *device,
struct vk_pipeline_cache_object *object)
{
struct tu_nir_shaders *shaders =
container_of(object, struct tu_nir_shaders, base);
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
ralloc_free(shaders->nir[i]);
vk_pipeline_cache_object_finish(&shaders->base);
vk_free(&device->alloc, shaders);
}
const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
.serialize = tu_nir_shaders_serialize,
.deserialize = tu_nir_shaders_deserialize,
.destroy = tu_nir_shaders_destroy,
};
static struct tu_nir_shaders *
tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
{
VK_MULTIALLOC(ma);
VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
return NULL;
memcpy(obj_key_data, key_data, key_size);
vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
&tu_nir_shaders_ops, obj_key_data, key_size);
return shaders;
}
static bool
tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
struct blob *blob)
{
struct tu_nir_shaders *shaders =
container_of(object, struct tu_nir_shaders, base);
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
if (shaders->nir[i]) {
blob_write_uint8(blob, 1);
nir_serialize(blob, shaders->nir[i], true);
} else {
blob_write_uint8(blob, 0);
}
}
return true;
}
static struct vk_pipeline_cache_object *
tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
const void *key_data,
size_t key_size,
struct blob_reader *blob)
{
struct tu_device *dev =
container_of(cache->base.device, struct tu_device, vk);
struct tu_nir_shaders *shaders =
tu_nir_shaders_init(dev, key_data, key_size);
if (!shaders)
return NULL;
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
if (blob_read_uint8(blob)) {
shaders->nir[i] =
nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
}
}
return &shaders->base;
}
static struct tu_nir_shaders *
tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
const void *key_data, size_t key_size,
bool *application_cache_hit)
{
struct vk_pipeline_cache_object *object =
vk_pipeline_cache_lookup_object(cache, key_data, key_size,
&tu_nir_shaders_ops, application_cache_hit);
if (object)
return container_of(object, struct tu_nir_shaders, base);
else
return NULL;
}
static struct tu_nir_shaders *
tu_nir_cache_insert(struct vk_pipeline_cache *cache,
struct tu_nir_shaders *shaders)
{
struct vk_pipeline_cache_object *object =
vk_pipeline_cache_add_object(cache, &shaders->base);
return container_of(object, struct tu_nir_shaders, base);
}
static VkResult
tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
VkResult result = VK_SUCCESS;
const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
NULL
};
VkPipelineCreationFeedback pipeline_feedback = {
.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
const bool executable_info =
builder->create_flags &
VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
bool retain_nir =
builder->create_flags &
VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
int64_t pipeline_start = os_time_get_nano();
const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
bool must_compile = false;
for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
if (!(builder->active_stages & builder->create_info->pStages[i].stage))
continue;
mesa_shader_stage stage =
vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
stage_infos[stage] = &builder->create_info->pStages[i];
must_compile = true;
}
/* Forward declare everything due to the goto usage */
nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
bool cache_hit = false;
struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(keys); stage = (mesa_shader_stage) (stage+1)) {
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
if (stage_infos[stage])
subgroup_info = vk_find_struct_const(stage_infos[stage],
PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
bool allow_varying_subgroup_size =
!stage_infos[stage] ||
(stage_infos[stage]->flags &
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
bool require_full_subgroups =
stage_infos[stage] &&
(stage_infos[stage]->flags &
VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
require_full_subgroups, subgroup_info,
builder->device);
if (stage_infos[stage]) {
struct vk_pipeline_robustness_state rs;
vk_pipeline_robustness_state_fill(&builder->device->vk, &rs,
builder->create_info->pNext,
stage_infos[stage]->pNext);
tu_shader_key_robustness(&keys[stage], &rs);
if (builder->create_flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
keys[stage].lower_view_index_to_device_index = true;
}
}
if ((builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
builder->graphics_state.ial &&
builder->create_info->renderPass == VK_NULL_HANDLE) {
const struct vk_input_attachment_location_state *ial =
builder->graphics_state.ial;
keys[MESA_SHADER_FRAGMENT].dynamic_renderpass = true;
uint32_t attachments_referenced = 0;
if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN) {
attachments_referenced |=
BITFIELD_MASK(MAX_RTS) << TU_DYN_INPUT_ATT_OFFSET;
} else {
for (unsigned i = 0; i < ial->color_attachment_count; i++) {
if (ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
attachments_referenced |=
(1u << (ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET));
}
}
}
if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
attachments_referenced |= 1;
else
attachments_referenced |= 1u << (ial->depth_att + 1);
}
if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
attachments_referenced |= 1;
else
attachments_referenced |= 1u << (ial->stencil_att + 1);
}
keys[MESA_SHADER_FRAGMENT].read_only_input_attachments =
~attachments_referenced;
}
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
keys[MESA_SHADER_FRAGMENT].custom_resolve =
builder->graphics_state.rp->custom_resolve;
}
if (builder->create_flags &
VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
for (unsigned i = 0; i < builder->num_libraries; i++) {
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
if (library->shaders[j].nir) {
assert(!nir[j]);
nir[j] = nir_shader_clone(builder->mem_ctx,
library->shaders[j].nir);
keys[j] = library->shaders[j].key;
must_compile = true;
}
}
}
}
struct tu_nir_shaders *nir_shaders = NULL;
if (!must_compile)
goto done;
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
keys[MESA_SHADER_VERTEX].multiview_mask =
builder->graphics_state.rp->view_mask;
mesa_shader_stage last_pre_rast_stage = MESA_SHADER_VERTEX;
for (int i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
if (nir[i]) {
last_pre_rast_stage = (mesa_shader_stage)i;
break;
}
}
keys[last_pre_rast_stage].fdm_per_layer = builder->fdm_per_layer;
}
if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
keys[MESA_SHADER_FRAGMENT].multiview_mask =
builder->graphics_state.rp->view_mask;
keys[MESA_SHADER_FRAGMENT].fragment_density_map =
builder->fragment_density_map;
keys[MESA_SHADER_FRAGMENT].fdm_per_layer =
builder->fdm_per_layer;
keys[MESA_SHADER_FRAGMENT].max_fdm_layers = builder->max_fdm_layers;
keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
builder->unscaled_input_fragcoord;
const VkPipelineMultisampleStateCreateInfo *msaa_info =
builder->create_info->pMultisampleState;
/* The 1.3.215 spec says:
*
* Sample shading can be used to specify a minimum number of unique
* samples to process for each fragment. If sample shading is enabled,
* an implementation must provide a minimum of
*
* max(ceil(minSampleShadingFactor * totalSamples), 1)
*
* unique associated data for each fragment, where
* minSampleShadingFactor is the minimum fraction of sample shading.
*
* The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
* They both require unique associated data.
*
* There are discussions to change the definition, such that
* sampleShadingEnable does not imply unique associated data. Before the
* discussions are settled and before apps (i.e., ANGLE) are fixed to
* follow the new and incompatible definition, we should stick to the
* current definition.
*
* Note that ir3_shader_key::sample_shading is not actually used by ir3,
* just checked in tu6_emit_fs_inputs. We will also copy the value to
* tu_shader_key::force_sample_interp in a bit.
*/
keys[MESA_SHADER_FRAGMENT].force_sample_interp =
!builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
}
unsigned char pipeline_sha1[20];
tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
&builder->layout, keys, builder->state);
unsigned char nir_sha1[21];
memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
nir_sha1[20] = 'N';
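/* The shader cache is keyed on the pipeline hash plus a one-byte suffix:
* the stage index for individual tu_shader objects (see shader_sha1 below)
* and 'N' for the bundle of retained post-link NIR.
*/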
if (!executable_info) {
cache_hit = true;
bool application_cache_hit = false;
unsigned char shader_sha1[21];
memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
for (mesa_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
stage = (mesa_shader_stage) (stage + 1)) {
if (stage_infos[stage] || nir[stage]) {
bool shader_application_cache_hit;
shader_sha1[20] = (unsigned char) stage;
shaders[stage] =
tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
sizeof(shader_sha1),
&shader_application_cache_hit);
if (!shaders[stage]) {
cache_hit = false;
break;
}
application_cache_hit &= shader_application_cache_hit;
}
}
/* If the user asks us to keep the NIR around, we need to have it for a
* successful cache hit. If we only have a "partial" cache hit, then we
* still need to recompile in order to get the NIR.
*/
if (cache_hit &&
(builder->create_flags &
VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
bool nir_application_cache_hit = false;
nir_shaders =
tu_nir_cache_lookup(builder->cache, &nir_sha1,
sizeof(nir_sha1),
&nir_application_cache_hit);
application_cache_hit &= nir_application_cache_hit;
cache_hit &= !!nir_shaders;
}
if (application_cache_hit && builder->cache != builder->device->mem_cache) {
pipeline_feedback.flags |=
VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
}
if (!cache_hit) {
if (builder->create_flags &
VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
return VK_PIPELINE_COMPILE_REQUIRED;
}
result = tu_compile_shaders(builder->device,
builder->create_flags,
stage_infos,
nir,
keys,
&builder->layout,
pipeline_sha1,
shaders,
executable_info ? nir_initial_disasm : NULL,
pipeline->executables_mem_ctx,
retain_nir ? post_link_nir : NULL,
stage_feedbacks);
if (result != VK_SUCCESS)
goto fail;
if (retain_nir) {
nir_shaders =
tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
if (!post_link_nir[stage])
continue;
nir_shaders->nir[stage] = post_link_nir[stage];
}
nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
}
for (mesa_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
stage = (mesa_shader_stage) (stage + 1)) {
if (!nir[stage])
continue;
shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
}
}
done:
/* Create empty shaders which contain the draw states to initialize
* registers for unused shader stages.
*/
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
if (!shaders[MESA_SHADER_TESS_CTRL]) {
shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
}
if (!shaders[MESA_SHADER_TESS_EVAL]) {
shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
}
if (!shaders[MESA_SHADER_GEOMETRY]) {
shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
}
}
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
if (!shaders[MESA_SHADER_FRAGMENT]) {
shaders[MESA_SHADER_FRAGMENT] =
builder->fragment_density_map ?
builder->device->empty_fs_fdm : builder->device->empty_fs;
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
}
}
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
if (shaders[stage] && shaders[stage]->variant) {
tu_append_executable(pipeline, shaders[stage]->variant,
nir_initial_disasm[stage]);
}
}
/* We may have deduplicated a cache entry, in which case our original
* post_link_nir may be gone.
*/
if (nir_shaders) {
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
if (nir_shaders->nir[stage]) {
post_link_nir[stage] = nir_shaders->nir[stage];
}
}
}
/* In the case where we're building a library without link-time
* optimization but with sub-libraries that retain LTO info, we should
* retain it ourselves in case another pipeline includes us with LTO.
*/
for (unsigned i = 0; i < builder->num_libraries; i++) {
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(library->shaders);
stage = (mesa_shader_stage) (stage + 1)) {
if (!post_link_nir[stage] && library->shaders[stage].nir) {
post_link_nir[stage] = library->shaders[stage].nir;
keys[stage] = library->shaders[stage].key;
}
if (!shaders[stage] && library->base.shaders[stage]) {
shaders[stage] = library->base.shaders[stage];
vk_pipeline_cache_object_ref(&shaders[stage]->base);
}
}
}
if (shaders[MESA_SHADER_VERTEX]) {
const struct ir3_shader_variant *vs =
shaders[MESA_SHADER_VERTEX]->variant;
if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
tu_append_executable(pipeline, vs->binning, NULL);
}
}
if (pipeline_contains_all_shader_state(pipeline)) {
/* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
* when compiling all stages, but make sure we don't leak.
*/
if (nir_shaders)
vk_pipeline_cache_object_unref(&builder->device->vk,
&nir_shaders->base);
} else {
struct tu_graphics_lib_pipeline *library =
tu_pipeline_to_graphics_lib(pipeline);
library->nir_shaders = nir_shaders;
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(library->shaders);
stage = (mesa_shader_stage) (stage + 1)) {
library->shaders[stage].nir = post_link_nir[stage];
library->shaders[stage].key = keys[stage];
}
}
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(shaders); stage = (mesa_shader_stage) (stage + 1)) {
pipeline->shaders[stage] = shaders[stage];
if (shaders[stage])
pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
}
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
if (creation_feedback) {
*creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
for (uint32_t i = 0; i < creation_feedback->pipelineStageCreationFeedbackCount; i++) {
mesa_shader_stage s =
vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
}
}
return VK_SUCCESS;
fail:
if (nir_shaders)
vk_pipeline_cache_object_unref(&builder->device->vk,
&nir_shaders->base);
return result;
}
static void
tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
const VkPipelineLibraryCreateInfoKHR *library_info =
vk_find_struct_const(builder->create_info->pNext,
PIPELINE_LIBRARY_CREATE_INFO_KHR);
if (library_info) {
assert(library_info->libraryCount <= MAX_LIBRARIES);
builder->num_libraries = library_info->libraryCount;
for (unsigned i = 0; i < library_info->libraryCount; i++) {
VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
}
}
/* Merge in the state from libraries. The program state is a bit special
* and is handled separately.
*/
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
for (unsigned i = 0; i < builder->num_libraries; i++) {
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
if (library->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
pipeline->output = library->base.output;
pipeline->lrz_blend.lrz_blend_status =
library->base.lrz_blend.lrz_blend_status;
pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
}
if ((library->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
(library->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
pipeline->prim_order = library->base.prim_order;
}
if (library->base.bandwidth.valid)
pipeline->bandwidth = library->base.bandwidth;
if (library->base.disable_fs.valid)
pipeline->disable_fs = library->base.disable_fs;
pipeline->set_state_mask |= library->base.set_state_mask;
u_foreach_bit (i, library->base.set_state_mask) {
pipeline->dynamic_state[i] = library->base.dynamic_state[i];
}
if (contains_all_shader_state(library->state)) {
pipeline->program = library->base.program;
pipeline->load_state = library->base.load_state;
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
if (library->base.shaders[i]) {
pipeline->shaders[i] = library->base.shaders[i];
vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
}
}
}
BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
library->base.static_state_mask);
vk_graphics_pipeline_state_merge(&builder->graphics_state,
&library->graphics_state);
}
}
static void
tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
if (layout) {
      /* Note: it's still valid to have a layout even if there are libraries.
       * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
       * a non-INDEPENDENT_SET layout, which may let us use a faster path;
       * currently this only affects dynamic offset descriptors.
       */
builder->layout = *layout;
} else {
for (unsigned i = 0; i < builder->num_libraries; i++) {
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
builder->layout.num_sets = MAX2(builder->layout.num_sets,
library->num_sets);
assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
for (unsigned j = 0; j < library->num_sets; j++) {
            builder->layout.set[j].layout = library->layouts[j];
}
builder->layout.push_constant_size = library->push_constant_size;
}
tu_pipeline_layout_init(&builder->layout);
}
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
struct tu_graphics_lib_pipeline *library =
tu_pipeline_to_graphics_lib(pipeline);
library->num_sets = builder->layout.num_sets;
for (unsigned i = 0; i < library->num_sets; i++) {
library->layouts[i] = builder->layout.set[i].layout;
if (library->layouts[i])
vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
}
library->push_constant_size = builder->layout.push_constant_size;
}
}
static void
tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
struct tu_const_state *const_state,
const struct ir3_shader_variant *v)
{
link->const_state = *ir3_const_state(v);
link->tu_const_state = *const_state;
link->constlen = v->constlen;
}
template <chip CHIP>
static void
tu_emit_program_state(struct tu_cs *sub_cs,
struct tu_program_state *prog,
struct tu_shader **shaders)
{
struct tu_device *dev = sub_cs->device;
struct tu_cs prog_cs;
const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
struct tu_draw_state draw_states[MESA_SHADER_STAGES];
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(variants); stage = (mesa_shader_stage) (stage+1)) {
variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
}
uint32_t safe_variants =
ir3_trim_constlen(variants, dev->compiler);
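   /* safe_variants is a bitmask of stages whose regular variant would
    * overflow the available const space; for those stages we switch to the
    * "safe const" variant and its matching draw state below.
    */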
unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
stage < ARRAY_SIZE(variants); stage = (mesa_shader_stage) (stage+1)) {
if (shaders[stage]) {
if (safe_variants & (1u << stage)) {
variants[stage] = shaders[stage]->safe_const_variant;
draw_states[stage] = shaders[stage]->safe_const_state;
} else {
draw_states[stage] = shaders[stage]->state;
}
for (unsigned i = 0; i < MAX_SETS; i++) {
if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
dynamic_descriptor_sizes[i] =
shaders[stage]->dynamic_descriptor_sizes[i];
}
}
if (variants[stage]) {
memcpy(prog->stage_sha1[stage], variants[stage]->sha1_str,
sizeof(variants[stage]->sha1_str));
}
}
}
for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
if (!variants[i])
continue;
tu_pipeline_set_linkage(&prog->link[i],
&shaders[i]->const_state,
variants[i]);
struct tu_push_constant_range *push_consts =
&shaders[i]->const_state.push_consts;
if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
prog->shared_consts = *push_consts;
}
if (variants[i]->info.uses_ray_intersection)
prog->uses_ray_intersection = true;
}
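   /* Dynamic descriptors from all sets are packed contiguously, so each
    * set's offset is the running sum of the per-set sizes gathered above.
    */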
unsigned dynamic_descriptor_offset = 0;
for (unsigned i = 0; i < MAX_SETS; i++) {
prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
}
/* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
* else that could depend on that state (like push constants)
*
* Note also that this always uses the full VS even in binning pass. The
* binning pass variant has the same const layout as the full VS, and
* the constlen for the VS will be the same or greater than the constlen
* for the binning pass variant. It is required that the constlen state
* matches between binning and draw passes, as some parts of the push
* consts are emitted in state groups that are shared between the binning
* and draw passes.
*/
tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
prog->vs_state = draw_states[MESA_SHADER_VERTEX];
/* Don't use the binning pass variant when GS is present because we don't
* support compiling correct binning pass variants with GS.
*/
if (variants[MESA_SHADER_GEOMETRY]) {
prog->vs_binning_state = prog->vs_state;
} else {
prog->vs_binning_state =
(safe_variants & (1u << MESA_SHADER_VERTEX))
? shaders[MESA_SHADER_VERTEX]->safe_const_binning_state
: shaders[MESA_SHADER_VERTEX]->binning_state;
}
prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
prog->gs_binning_state =
(safe_variants & (1u << MESA_SHADER_GEOMETRY)) ?
shaders[MESA_SHADER_GEOMETRY]->safe_const_binning_state :
shaders[MESA_SHADER_GEOMETRY]->binning_state;
prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
const struct ir3_shader_variant *last_variant;
const struct tu_shader *last_shader;
if (gs) {
last_shader = shaders[MESA_SHADER_GEOMETRY];
last_variant = gs;
} else if (ds) {
last_shader = shaders[MESA_SHADER_TESS_EVAL];
last_variant = ds;
} else {
last_shader = shaders[MESA_SHADER_VERTEX];
last_variant = vs;
}
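   /* Use the HW per-view viewport path only when the last geometry stage
    * doesn't write the viewport index itself, the fragment shader uses a
    * fragment density map, and the GPU supports it. With either per-view or
    * per-layer FDM viewports, the single app-visible viewport is replicated
    * ("faked") across all HW viewports at draw time.
    */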
prog->per_view_viewport =
!last_variant->writes_viewport &&
shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
dev->physical_device->info->props.has_per_view_viewport;
prog->per_layer_viewport = last_shader->per_layer_viewport;
prog->fake_single_viewport = prog->per_view_viewport ||
prog->per_layer_viewport;
prog->writes_shading_rate = last_variant->writes_shading_rate;
prog->reads_shading_rate = fs->reads_shading_rate;
}
static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
MESA_VK_DYNAMIC_VI,
};
template <chip CHIP>
static unsigned
tu6_vertex_input_size(struct tu_device *dev,
const struct vk_vertex_input_state *vi)
{
return 1 + 2 * util_last_bit(vi->attributes_valid);
}
template <chip CHIP>
static void
tu6_emit_vertex_input(struct tu_cs *cs,
const struct vk_vertex_input_state *vi)
{
unsigned attr_count = util_last_bit(vi->attributes_valid);
if (attr_count != 0)
tu_cs_emit_pkt4(cs, REG_A6XX_VFD_FETCH_INSTR_INSTR(0), attr_count * 2);
for (uint32_t loc = 0; loc < attr_count; loc++) {
const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
if (vi->attributes_valid & (1u << loc)) {
const struct vk_vertex_binding_state *binding =
&vi->bindings[attr->binding];
enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
const struct tu_native_format format = tu6_format_vtx(pipe_format);
tu_cs_emit(cs, A6XX_VFD_FETCH_INSTR_INSTR(0,
.idx = attr->binding,
.offset = attr->offset,
.instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
.format = format.fmt,
.swap = format.swap,
.unk30 = 1,
._float = !util_format_is_pure_integer(pipe_format)).value);
tu_cs_emit(cs, A6XX_VFD_FETCH_INSTR_STEP_RATE(0, binding->divisor).value);
} else {
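         /* Unused locations still take a register pair so that the remaining
          * attributes land at their expected offsets within the packet.
          */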
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
}
}
}
static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
};
template <chip CHIP>
static unsigned
tu6_vertex_stride_size(struct tu_device *dev,
const struct vk_vertex_input_state *vi)
{
return 1 + 2 * util_last_bit(vi->bindings_valid);
}
template <chip CHIP>
static void
tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
{
if (vi->bindings_valid) {
unsigned bindings_count = util_last_bit(vi->bindings_valid);
tu_crb crb = cs->crb(bindings_count);
for (unsigned i = 0; i < bindings_count; i++) {
crb.add(A6XX_VFD_VERTEX_BUFFER_STRIDE(
i, .vfd_vertex_buffer_stride = vi->bindings[i].stride));
}
}
}
template <chip CHIP>
static unsigned
tu6_vertex_stride_size_dyn(struct tu_device *dev,
const uint16_t *vi_binding_stride,
uint32_t bindings_valid)
{
return 1 + 2 * util_last_bit(bindings_valid);
}
template <chip CHIP>
static void
tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
uint32_t bindings_valid)
{
if (bindings_valid) {
unsigned bindings_count = util_last_bit(bindings_valid);
tu_crb crb = cs->crb(bindings_count);
for (unsigned i = 0; i < bindings_count; i++) {
crb.add(A6XX_VFD_VERTEX_BUFFER_STRIDE(
i, .vfd_vertex_buffer_stride = vi_binding_stride[i]));
}
}
}
static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
MESA_VK_DYNAMIC_VP_VIEWPORTS,
MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
};
template <chip CHIP>
static unsigned
tu6_viewport_size(struct tu_device *dev,
const struct vk_viewport_state *vp,
const struct vk_rasterization_state *rs)
{
return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
1 + vp->viewport_count * 2 + 5;
}
template <chip CHIP>
static void
tu6_emit_viewport(struct tu_cs *cs,
const struct vk_viewport_state *vp,
const struct vk_rasterization_state *rs)
{
VkExtent2D guardband = {511, 511};
tu_cs_emit_pkt4(cs, GRAS_CL_VIEWPORT_XOFFSET(CHIP, 0).reg, vp->viewport_count * 6);
for (uint32_t i = 0; i < vp->viewport_count; i++) {
const VkViewport *viewport = &vp->viewports[i];
float offsets[3];
float scales[3];
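      /* Build the HW viewport transform (x' = xoffset + xscale * x_ndc, and
       * similarly for y/z). With negativeOneToOne depth the NDC z range is
       * [-1, 1], hence the halved z scale and the centered z offset.
       */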
scales[0] = viewport->width / 2.0f;
scales[1] = viewport->height / 2.0f;
if (vp->depth_clip_negative_one_to_one) {
scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
} else {
scales[2] = viewport->maxDepth - viewport->minDepth;
}
offsets[0] = viewport->x + scales[0];
offsets[1] = viewport->y + scales[1];
if (vp->depth_clip_negative_one_to_one) {
offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
} else {
offsets[2] = viewport->minDepth;
}
for (uint32_t j = 0; j < 3; j++) {
tu_cs_emit(cs, fui(offsets[j]));
tu_cs_emit(cs, fui(scales[j]));
}
guardband.width =
MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
guardband.height =
MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
}
tu_cs_emit_pkt4(cs, GRAS_SC_VIEWPORT_SCISSOR_TL(CHIP, 0).reg, vp->viewport_count * 2);
for (uint32_t i = 0; i < vp->viewport_count; i++) {
const VkViewport *viewport = &vp->viewports[i];
VkOffset2D min;
VkOffset2D max;
min.x = (int32_t) viewport->x;
max.x = (int32_t) ceilf(viewport->x + viewport->width);
if (viewport->height >= 0.0f) {
min.y = (int32_t) viewport->y;
max.y = (int32_t) ceilf(viewport->y + viewport->height);
} else {
min.y = (int32_t)(viewport->y + viewport->height);
max.y = (int32_t) ceilf(viewport->y);
}
/* the spec allows viewport->height to be 0.0f */
if (min.y == max.y)
max.y++;
/* allow viewport->width = 0.0f for un-initialized viewports: */
if (min.x == max.x)
max.x++;
min.x = MAX2(min.x, 0);
min.y = MAX2(min.y, 0);
max.x = MAX2(max.x, 1);
max.y = MAX2(max.y, 1);
assert(min.x < max.x);
assert(min.y < max.y);
tu_cs_emit(
cs, GRAS_SC_VIEWPORT_SCISSOR_TL(CHIP, 0, .x = min.x, .y = min.y).value);
tu_cs_emit(
cs, GRAS_SC_VIEWPORT_SCISSOR_BR(CHIP, 0, .x = max.x - 1, .y = max.y - 1)
.value);
}
/* A7XX+ doesn't clamp to [0,1] with disabled depth clamp, to support
* VK_EXT_depth_clamp_zero_one we have to always enable clamp and manually
* set range to [0,1] when rs->depth_clamp_enable is false.
*/
bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VIEWPORT_ZCLAMP(0), vp->viewport_count * 2);
for (uint32_t i = 0; i < vp->viewport_count; i++) {
const VkViewport *viewport = &vp->viewports[i];
if (zero_one_depth_clamp) {
tu_cs_emit(cs, fui(0.0f));
tu_cs_emit(cs, fui(1.0f));
} else {
tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
}
}
tu_cs_emit_regs(cs,
GRAS_CL_GUARDBAND_CLIP_ADJ(CHIP, .horz = guardband.width, .vert = guardband.height));
   /* TODO: what to do about this with multiple viewports? */
float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
if (zero_one_depth_clamp) {
z_clamp_min = 0.0f;
z_clamp_max = 1.0f;
}
tu_cs_emit_regs(cs,
RB_VIEWPORT_ZCLAMP_MIN(CHIP, z_clamp_min),
RB_VIEWPORT_ZCLAMP_MAX(CHIP, z_clamp_max));
}
struct apply_viewport_state {
struct vk_viewport_state vp;
struct vk_rasterization_state rs;
/* See tu_render_pass_state::shared_viewport */
bool share_scale;
/* See tu_pipeline::fake_single_viewport */
bool fake_single_viewport;
bool custom_resolve;
};
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
* must be the same for all views. This means that rendering coordinates
* cannot be a simple scaling of framebuffer coordinates, because this would
* require us to scale the window offset and the scale may be different per
* view. Instead we have to apply a per-bin offset to the rendering coordinate
* transform to make sure that the window offset maps to the per-view bin
* coordinate, which will be the same if there is no offset. Specifically we
* need an offset o to the transform:
*
* x' = s * x + o
*
* so that when we plug in the per-view bin start b_s and the common window
* offset b_cs:
*
* b_cs = s * b_s + o
*
* and we get:
*
* o = b_cs - s * b_s
*
* We use this form exactly, because we know the bin start is a multiple of
* the frag area so s * b_s is an integer and we can compute an exact result
* easily. We also have to make sure that the bin offset is a multiple of the
* frag area by restricting the frag area.
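 *
 * For example (hypothetical numbers): with a frag area of 2x2, a per-view
 * bin starting at (96, 64) and a common window offset of (0, 0), the offset
 * is (0 - 96/2, 0 - 64/2) = (-48, -32), so framebuffer x = 96 maps to
 * 96 * 0.5 - 48 = 0, i.e. exactly the common window offset.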
*/
VkOffset2D
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
VkOffset2D common_bin_offset)
{
assert(bin.offset.x % frag_area.width == 0);
assert(bin.offset.y % frag_area.height == 0);
return (VkOffset2D) {
common_bin_offset.x - bin.offset.x / frag_area.width,
common_bin_offset.y - bin.offset.y / frag_area.height
};
}
static void
fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
struct vk_viewport_state vp = state->vp;
for (unsigned i = 0; i < state->vp.viewport_count; i++) {
      /* Note: If we're using shared scaling, the scale should already be the
       * same across all views, so we could pick any view. However the number
       * of viewports and the number of views are not guaranteed to be the
       * same, so to be safe we pick the 0'th view, which always exists.
       *
       * If FDM per layer is enabled in the shader but disabled by the
       * renderpass, views will be 1 and we also have to replicate the 0'th
       * view to every view.
       */
VkExtent2D frag_area =
(state->share_scale || views == 1) ? frag_areas[0] : frag_areas[i];
VkRect2D bin =
(state->share_scale || views == 1) ? bins[0] : bins[i];
VkOffset2D hw_viewport_offset =
(state->share_scale || views == 1) ? hw_viewport_offsets[0] :
hw_viewport_offsets[i];
/* Implement fake_single_viewport by replicating viewport 0 across all
* views.
*/
VkViewport viewport =
state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i];
if ((frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) ||
/* When in a custom resolve operation (TODO: and using
* non-subsampled images) we switch to framebuffer coordinates so we
* shouldn't apply the transform. However the binning pass isn't
* aware of this, so we have to keep applying the transform for
* binning.
*/
(state->custom_resolve && !binning)) {
vp.viewports[i] = viewport;
continue;
}
float scale_x = (float) 1.0f / frag_area.width;
float scale_y = (float) 1.0f / frag_area.height;
vp.viewports[i].minDepth = viewport.minDepth;
vp.viewports[i].maxDepth = viewport.maxDepth;
vp.viewports[i].width = viewport.width * scale_x;
vp.viewports[i].height = viewport.height * scale_y;
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
vp.viewports[i].x = scale_x * viewport.x + offset.x;
vp.viewports[i].y = scale_y * viewport.y + offset.y;
}
TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
}
static void
tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
const struct vk_viewport_state *vp,
const struct vk_rasterization_state *rs)
{
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
struct apply_viewport_state state = {
.vp = *vp,
.rs = *rs,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.viewport_count = num_views;
else if (cmd->state.per_layer_viewport)
state.vp.viewport_count = cmd->state.max_fdm_layers;
unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
tu_create_fdm_bin_patchpoint(cmd, cs, size, TU_FDM_NONE,
fdm_apply_viewports, state);
cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport &&
!cmd->state.program.per_layer_viewport;
}
static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
MESA_VK_DYNAMIC_VP_SCISSORS,
MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
};
template <chip CHIP>
static unsigned
tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
{
return 1 + vp->scissor_count * 2;
}
template <chip CHIP>
void
tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
{
tu_cs_emit_pkt4(cs, GRAS_SC_SCREEN_SCISSOR_TL(CHIP, 0).reg, vp->scissor_count * 2);
for (uint32_t i = 0; i < vp->scissor_count; i++) {
const VkRect2D *scissor = &vp->scissors[i];
uint32_t min_x = scissor->offset.x;
uint32_t min_y = scissor->offset.y;
uint32_t max_x = min_x + scissor->extent.width - 1;
uint32_t max_y = min_y + scissor->extent.height - 1;
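      /* A zero-area scissor is encoded as an inverted (TL > BR) rectangle so
       * that all fragments are rejected.
       */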
if (!scissor->extent.width || !scissor->extent.height) {
min_x = min_y = 1;
max_x = max_y = 0;
} else {
/* avoid overflow */
uint32_t scissor_max = BITFIELD_MASK(15);
min_x = MIN2(scissor_max, min_x);
min_y = MIN2(scissor_max, min_y);
max_x = MIN2(scissor_max, max_x);
max_y = MIN2(scissor_max, max_y);
}
tu_cs_emit(cs, GRAS_SC_SCREEN_SCISSOR_TL(CHIP, i, .x = min_x, .y = min_y).value);
tu_cs_emit(cs, GRAS_SC_SCREEN_SCISSOR_BR(CHIP, i, .x = max_x, .y = max_y).value);
}
}
static void
fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
struct vk_viewport_state vp = state->vp;
for (unsigned i = 0; i < vp.scissor_count; i++) {
VkExtent2D frag_area =
(state->share_scale || views == 1) ? frag_areas[0] : frag_areas[i];
VkRect2D bin =
(state->share_scale || views == 1) ? bins[0] : bins[i];
VkRect2D scissor =
state->fake_single_viewport ? state->vp.scissors[0] : state->vp.scissors[i];
VkOffset2D hw_viewport_offset =
(state->share_scale || views == 1) ? hw_viewport_offsets[0] :
hw_viewport_offsets[i];
/* Transform the scissor following the viewport. It's unclear how this
* is supposed to handle cases where the scissor isn't aligned to the
* fragment area, but we round outwards to always render partial
* fragments if the scissor size equals the framebuffer size and it
* isn't aligned to the fragment area.
*/
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
/* Disable scaling and offset when doing a custom resolve to a
* non-subsampled image and not in the binning pass, because we
* use framebuffer coordinates.
*
* TODO: When we support subsampled images, only do this for
* non-subsampled images.
*/
if (state->custom_resolve && !binning) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) {1, 1};
}
VkOffset2D min = {
scissor.offset.x / frag_area.width + offset.x,
         scissor.offset.y / frag_area.height + offset.y,
};
VkOffset2D max = {
DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
};
/* Intersect scissor with the scaled bin, this essentially replaces the
* window scissor. With custom resolve (TODO: and non-subsampled images)
* we have to use the unscaled bin instead.
*/
uint32_t scaled_width = bin.extent.width / frag_area.width;
uint32_t scaled_height = bin.extent.height / frag_area.height;
int32_t bin_x;
int32_t bin_y;
if (state->custom_resolve && !binning) {
bin_x = bin.offset.x;
bin_y = bin.offset.y;
} else {
bin_x = common_bin_offset.x - hw_viewport_offset.x;
bin_y = common_bin_offset.y - hw_viewport_offset.y;
}
vp.scissors[i].offset.x = MAX2(min.x, bin_x);
vp.scissors[i].offset.y = MAX2(min.y, bin_y);
vp.scissors[i].extent.width =
MIN2(max.x, bin_x + scaled_width) - vp.scissors[i].offset.x;
vp.scissors[i].extent.height =
MIN2(max.y, bin_y + scaled_height) - vp.scissors[i].offset.y;
}
TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
}
static void
tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
const struct vk_viewport_state *vp)
{
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
struct apply_viewport_state state = {
.vp = *vp,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.scissor_count = num_views;
else if (cmd->state.per_layer_viewport)
state.vp.scissor_count = cmd->state.max_fdm_layers;
unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
tu_create_fdm_bin_patchpoint(cmd, cs, size, TU_FDM_NONE, fdm_apply_scissors,
state);
}
static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
};
template <chip CHIP>
static unsigned
tu6_sample_locations_size(struct tu_device *dev, bool enable,
const struct vk_sample_locations_state *samp_loc)
{
return 6 + (enable ? 9 : 0);
}
template <chip CHIP>
void
tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
const struct vk_sample_locations_state *samp_loc)
{
uint32_t sample_config =
COND(enable, A6XX_RB_MSAA_SAMPLE_POS_CNTL_LOCATION_ENABLE);
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL, 1);
tu_cs_emit(cs, sample_config);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MSAA_SAMPLE_POS_CNTL, 1);
tu_cs_emit(cs, sample_config);
tu_cs_emit_pkt4(cs, REG_A6XX_TPL1_MSAA_SAMPLE_POS_CNTL, 1);
tu_cs_emit(cs, sample_config);
if (!enable)
return;
assert(samp_loc->grid_size.width == 1);
assert(samp_loc->grid_size.height == 1);
uint64_t sample_locations = 0;
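   /* Each sample occupies one byte (4-bit fixed-point X and Y fields),
    * packed into a 64-bit value that is written identically to the GRAS, RB
    * and TPL1 copies below.
    */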
for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
/* From VkSampleLocationEXT:
*
* The values specified in a VkSampleLocationEXT structure are always
* clamped to the implementation-dependent sample location coordinate
* range
* [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
*/
float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
SAMPLE_LOCATION_MAX);
float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
SAMPLE_LOCATION_MAX);
sample_locations |=
((uint64_t)(A6XX_RB_PROGRAMMABLE_MSAA_POS_0_SAMPLE_0_X(x) |
A6XX_RB_PROGRAMMABLE_MSAA_POS_0_SAMPLE_0_Y(y))) << i*8;
}
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_PROGRAMMABLE_MSAA_POS_0, 2);
tu_cs_emit_qw(cs, sample_locations);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_PROGRAMMABLE_MSAA_POS_0, 2);
tu_cs_emit_qw(cs, sample_locations);
tu_cs_emit_pkt4(cs, REG_A6XX_TPL1_PROGRAMMABLE_MSAA_POS_0, 2);
tu_cs_emit_qw(cs, sample_locations);
}
static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
};
template <chip CHIP>
static unsigned
tu6_depth_bias_size(struct tu_device *dev,
const struct vk_rasterization_state *rs)
{
return 4;
}
template <chip CHIP>
void
tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
{
tu_cs_emit_regs(cs,
GRAS_SU_POLY_OFFSET_SCALE(CHIP, rs->depth_bias.slope_factor),
GRAS_SU_POLY_OFFSET_OFFSET(CHIP, rs->depth_bias.constant_factor),
GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(CHIP, rs->depth_bias.clamp));
}
static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
MESA_VK_DYNAMIC_CB_LOGIC_OP,
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
};
static void
tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
const struct vk_color_blend_state *cb,
const struct vk_render_pass_state *rp)
{
bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
uint32_t total_bpp = 0;
for (unsigned i = 0; i < cb->attachment_count; i++) {
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
if (!(cb->color_write_enables & (1u << i)))
continue;
const VkFormat format = rp->color_attachment_formats[i];
uint32_t write_bpp = 0;
if (format == VK_FORMAT_UNDEFINED) {
/* do nothing */
} else if (att->write_mask == 0xf) {
write_bpp = vk_format_get_blocksizebits(format);
} else {
const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
for (uint32_t i = 0; i < 4; i++) {
if (att->write_mask & (1 << i)) {
write_bpp += util_format_get_component_bits(pipe_format,
UTIL_FORMAT_COLORSPACE_RGB, i);
}
}
}
total_bpp += write_bpp;
if (rop_reads_dst || att->blend_enable) {
total_bpp += write_bpp;
}
}
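   /* total_bpp is in bits; report bandwidth in bytes per sample. */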
bandwidth->color_bandwidth_per_sample = total_bpp / 8;
if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
vk_format_to_pipe_format(rp->depth_attachment_format),
UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
}
if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
vk_format_to_pipe_format(rp->stencil_attachment_format),
UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
}
bandwidth->valid = true;
}
static const enum mesa_vk_dynamic_graphics_state tu_disable_fs_state[] = {
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
};
static bool
tu_calc_disable_fs(const struct vk_color_blend_state *cb,
const struct vk_render_pass_state *rp,
bool alpha_to_coverage_enable,
const struct tu_shader *fs)
{
if (alpha_to_coverage_enable)
return false;
if (fs && !fs->variant->writes_only_color)
return false;
bool has_enabled_attachments = false;
for (unsigned i = 0; i < cb->attachment_count; i++) {
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
continue;
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
if ((cb->color_write_enables & (1u << i)) && att->write_mask != 0) {
has_enabled_attachments = true;
break;
}
}
return !fs || fs->variant->empty ||
(fs->variant->writes_only_color && !has_enabled_attachments);
}
static void
tu_emit_disable_fs(struct tu_disable_fs *disable_fs,
const struct vk_color_blend_state *cb,
const struct vk_render_pass_state *rp,
bool alpha_to_coverage_enable,
const struct tu_shader *fs)
{
disable_fs->disable_fs =
tu_calc_disable_fs(cb, rp, alpha_to_coverage_enable, fs);
disable_fs->valid = true;
}
/* Classify, for LRZ purposes, how the blend state interacts with the color
 * attachments: whether it reads the destination or partially writes it,
 * skips all color writes, or is safe for LRZ.
 */
static tu_lrz_blend_status
tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
const struct vk_render_pass_state *rp)
{
if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
uint32_t written_color_attachments = 0;
uint32_t total_color_attachments = 0;
for (unsigned i = 0; i < cb->attachment_count; i++) {
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
continue;
total_color_attachments++;
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
if ((cb->color_write_enables & (1u << i)) && att->write_mask != 0) {
written_color_attachments++;
}
}
if (total_color_attachments == 0)
return TU_LRZ_BLEND_SAFE_FOR_LRZ;
if (written_color_attachments == 0)
return TU_LRZ_BLEND_ALL_COLOR_WRITES_SKIPPED;
if (written_color_attachments < cb->attachment_count)
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
for (unsigned i = 0; i < cb->attachment_count; i++) {
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
continue;
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
if (att->blend_enable)
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
if (!(cb->color_write_enables & (1u << i)))
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
unsigned mask =
MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
if ((att->write_mask & mask) != mask)
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
}
return TU_LRZ_BLEND_SAFE_FOR_LRZ;
}
static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
MESA_VK_DYNAMIC_CB_LOGIC_OP,
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
};
static void
tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
const struct vk_color_blend_state *cb,
const struct vk_render_pass_state *rp)
{
lrz->lrz_blend_status = tu6_calc_blend_lrz(cb, rp);
lrz->valid = true;
}
static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
MESA_VK_DYNAMIC_CB_LOGIC_OP,
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP,
};
template <chip CHIP>
static unsigned
tu6_blend_size(struct tu_device *dev,
const struct vk_color_blend_state *cb,
const struct vk_color_attachment_location_state *cal,
const struct vk_render_pass_state *rp,
bool alpha_to_coverage_enable,
bool alpha_to_one_enable,
uint32_t sample_mask)
{
unsigned num_rts = alpha_to_coverage_enable ?
MAX2(cb->attachment_count, 1) : cb->attachment_count;
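   /* 2 dwords each for SP_BLEND_CNTL, RB_BLEND_CNTL, SP_PS_MRT_CNTL and
    * RB_PS_MRT_CNTL, plus 3 dwords of RB_MRT_CONTROL/RB_MRT_BLEND_CONTROL
    * per render target.
    */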
return 8 + 3 * num_rts;
}
template <chip CHIP>
static void
tu6_emit_blend(struct tu_cs *cs,
const struct vk_color_blend_state *cb,
const struct vk_color_attachment_location_state *cal,
const struct vk_render_pass_state *rp,
bool alpha_to_coverage_enable,
bool alpha_to_one_enable,
uint32_t sample_mask)
{
bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
uint32_t blend_enable_mask = 0;
for (unsigned i = 0; i < cb->attachment_count; i++) {
if (!(cb->color_write_enables & (1u << i)) ||
cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
continue;
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
VkFormat att_format = rp->color_attachment_formats[i];
bool is_float_or_srgb = vk_format_is_float(att_format) || vk_format_is_srgb(att_format);
/* Logic op overrides any blending. Even when logic op is present, blending
* should be kept disabled for any ops that don't read dst values or for
* attachments of float or sRGB formats.
*/
if ((att->blend_enable && !cb->logic_op_enable) || (rop_reads_dst && !is_float_or_srgb)) {
blend_enable_mask |= 1u << cal->color_map[i];
}
}
   /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
    * enabled but there are no color attachments, in addition to changing
    * SP_PS_MRT_CNTL/RB_PS_MRT_CNTL.
    */
unsigned num_rts = alpha_to_coverage_enable ?
MAX2(cb->attachment_count, 1) : cb->attachment_count;
bool dual_src_blend = tu_blend_state_is_dual_src(cb);
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP, .enable_blend = blend_enable_mask,
.independent_blend_en = true,
.dual_color_in_enable =
dual_src_blend,
.alpha_to_coverage =
alpha_to_coverage_enable));
/* TODO: set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled?
*
* We could also set blend_reads_dest more conservatively, but it didn't show
* performance wins in anholt's testing:
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-color-reads
*/
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.blend_reads_dest = blend_enable_mask,
.independent_blend = true,
.dual_color_in_enable =
dual_src_blend,
.alpha_to_coverage =
alpha_to_coverage_enable,
.alpha_to_one = alpha_to_one_enable,
.sample_mask = sample_mask));
unsigned num_remapped_rts = 0;
for (unsigned i = 0; i < num_rts; i++) {
if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
continue;
unsigned remapped_idx = cal->color_map[i];
num_remapped_rts = MAX2(num_remapped_rts, remapped_idx + 1);
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
const enum adreno_rb_blend_factor src_color_factor =
tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
const enum adreno_rb_blend_factor dst_color_factor =
tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
const enum a3xx_rb_blend_opcode alpha_op =
tu6_blend_op(att->alpha_blend_op);
const enum adreno_rb_blend_factor src_alpha_factor =
tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
const enum adreno_rb_blend_factor dst_alpha_factor =
tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
VkFormat att_format = rp->color_attachment_formats[i];
bool is_float_or_srgb = vk_format_is_float(att_format) || vk_format_is_srgb(att_format);
         /* Logic op overrides blending, and logic ops don't apply to float
          * or sRGB attachments. These conditions mirror the blend-enable
          * mask construction above, minus the dst-reading rop term which
          * only matters there.
          */
bool blend_enable = att->blend_enable && !cb->logic_op_enable;
bool logic_op_enable = cb->logic_op_enable && !is_float_or_srgb;
tu_cs_emit_regs(cs,
A6XX_RB_MRT_CONTROL(remapped_idx,
.color_blend_en = blend_enable,
.alpha_blend_en = blend_enable,
.rop_enable = logic_op_enable,
.rop_code = rop,
.component_enable = att->write_mask),
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,
.rgb_src_factor = src_color_factor,
.rgb_blend_opcode = color_op,
.rgb_dest_factor = dst_color_factor,
.alpha_src_factor = src_alpha_factor,
.alpha_blend_opcode = alpha_op,
.alpha_dest_factor = dst_alpha_factor));
} else {
tu_cs_emit_regs(cs,
A6XX_RB_MRT_CONTROL(remapped_idx,),
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
}
}
tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_remapped_rts));
}
static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
};
template <chip CHIP>
static unsigned
tu6_blend_constants_size(struct tu_device *dev,
const struct vk_color_blend_state *cb)
{
return 5;
}
template <chip CHIP>
static void
tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
{
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CONSTANT_RED_FP32, 4);
tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
}
static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
MESA_VK_DYNAMIC_RS_POLYGON_MODE,
MESA_VK_DYNAMIC_RS_CULL_MODE,
MESA_VK_DYNAMIC_RS_FRONT_FACE,
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
MESA_VK_DYNAMIC_RS_LINE_MODE,
MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
MESA_VK_DYNAMIC_RS_LINE_WIDTH,
MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE,
MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE,
};
template <chip CHIP>
uint32_t
tu6_rast_size(struct tu_device *dev,
const struct vk_rasterization_state *rs,
const struct vk_viewport_state *vp,
bool multiview,
bool per_view_viewport,
bool disable_fs)
{
if (CHIP == A6XX && dev->physical_device->info->props.is_a702) {
return 17;
} else if (CHIP == A6XX) {
return 15 + (dev->physical_device->info->props.has_legacy_pipeline_shading_rate ? 8 : 0);
} else {
return 27;
}
}
template <chip CHIP>
void
tu6_emit_rast(struct tu_cs *cs,
const struct vk_rasterization_state *rs,
const struct vk_viewport_state *vp,
bool multiview,
bool per_view_viewport,
bool disable_fs)
{
enum a5xx_line_mode line_mode =
rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
BRESENHAM : RECTANGULAR;
tu_cs_emit_regs(cs,
GRAS_SU_CNTL(CHIP,
.cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
.cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
.front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
.linehalfwidth = rs->line.width / 2.0f,
.poly_offset = rs->depth_bias.enable,
.line_mode = line_mode,
.multiview_enable = multiview,
.rendertargetindexincr = multiview,
.viewportindexincr = multiview && per_view_viewport));
bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
tu_cs_emit_regs(cs,
GRAS_CL_CNTL(CHIP,
.znear_clip_disable = !depth_clip_enable,
.zfar_clip_disable = !depth_clip_enable,
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
.zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
                   .vp_clip_code_ignore = 1));
enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
tu_cs_emit_regs(cs, VPC_RAST_CNTL(CHIP, polygon_mode));
tu_cs_emit_regs(cs,
PC_DGEN_RAST_CNTL(CHIP, polygon_mode));
if (CHIP == A7XX || cs->device->physical_device->info->props.is_a702) {
tu_cs_emit_regs(cs, VPC_PS_RAST_CNTL(CHIP, polygon_mode));
}
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL(CHIP,
.stream = rs->rasterization_stream,
.discard = rs->rasterizer_discard_enable));
if (CHIP == A6XX) {
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP,
.raster_discard = rs->rasterizer_discard_enable));
} else {
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
.stream = rs->rasterization_stream,
.discard = rs->rasterizer_discard_enable));
bool conservative_ras_en =
rs->conservative_mode ==
VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
/* This is important to get D/S only draw calls to bypass invoking
* the fragment shader. The public documentation for Adreno states:
* "Hint the driver to engage Fast-Z by using an empty fragment
* shader and disabling frame buffer write masks for renderpasses
* that modify Z values only."
* "The GPU has a special mode that writes Z-only pixels at twice
* the normal rate."
*/
tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
.fs_disable = disable_fs,
.raster_mode = TYPE_TILED,
.raster_direction = LR_TB,
.conservativerasen = conservative_ras_en));
if (CHIP >= A7XX) {
tu_cs_emit_regs(cs, GRAS_SU_RENDER_CNTL(CHIP, .fs_disable = disable_fs));
tu_cs_emit_regs(cs, SP_RENDER_CNTL(CHIP, .fs_disable = disable_fs));
}
tu_cs_emit_regs(
cs, PC_DGEN_SU_CONSERVATIVE_RAS_CNTL(CHIP, conservative_ras_en));
      /* The shift amount selects between normal rasterization and the two
       * conservative rasterization modes:
* - shift_amount = 0 (NO_SHIFT) - normal rasterization
* - shift_amount = 1 (HALF_PIXEL_SHIFT) - overestimate by half a pixel
* plus the rasterization grid size (1/256)
* - shift_amount = 2 (FULL_PIXEL_SHIFT) - overestimate by another half
* a pixel
*
* We expose a max of 0.5 and a granularity of 0.5, so the app should
* only give us 0 or 0.5 which correspond to HALF_PIXEL_SHIFT and
* FULL_PIXEL_SHIFT respectively. If they give us anything else just
* assume they meant 0.5 as the most conservative choice.
*/
enum a6xx_shift_amount shift_amount = conservative_ras_en ?
(rs->extra_primitive_overestimation_size != 0. ?
FULL_PIXEL_SHIFT : HALF_PIXEL_SHIFT) : NO_SHIFT;
tu_cs_emit_regs(cs, GRAS_SU_CONSERVATIVE_RAS_CNTL(CHIP,
.conservativerasen = conservative_ras_en,
.shiftamount = shift_amount));
}
/* move to hw ctx init? */
tu_cs_emit_regs(cs,
GRAS_SU_POINT_MINMAX(CHIP, .min = 1.0f / 16.0f, .max = 4092.0f),
GRAS_SU_POINT_SIZE(CHIP, 1.0f));
if (CHIP == A6XX && cs->device->physical_device->info->props.has_legacy_pipeline_shading_rate) {
tu_cs_emit_regs(cs, RB_UNKNOWN_8A00(CHIP));
tu_cs_emit_regs(cs, RB_UNKNOWN_8A10(CHIP));
tu_cs_emit_regs(cs, RB_UNKNOWN_8A20(CHIP));
tu_cs_emit_regs(cs, RB_UNKNOWN_8A30(CHIP));
}
}
static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
MESA_VK_DYNAMIC_DS_STENCIL_OP,
MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
};
template <chip CHIP>
static unsigned
tu6_ds_size(struct tu_device *dev,
const struct vk_depth_stencil_state *ds,
const struct vk_render_pass_state *rp)
{
return 13;
}
template <chip CHIP>
static void
tu6_emit_ds(struct tu_cs *cs,
const struct vk_depth_stencil_state *ds,
const struct vk_render_pass_state *rp)
{
bool stencil_test_enable =
ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
   /* While the .stencil_read field can be used to avoid reading stencil when
    * the func/ops leave it unused, there was no perf change in the one of 42
    * games tested that was affected (Transport Fever, 0.0 +/- 0.0% change).
    * Besides, in some cases where we could clear stencil_read here, the
    * packed z/s is going to be read anyway due to depth testing, though that
    * doesn't apply to this game.
*
* Given that the condition for avoiding stencil_read is fairly complicated,
* we won't bother with the CPU overhead until we can see some win from it.
*
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-s-reads
*/
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CNTL(
.stencil_enable = stencil_test_enable,
.stencil_enable_bf = stencil_test_enable,
.stencil_read = stencil_test_enable,
.func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
.fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
.zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
.zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
.func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
.fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
.zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
.zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
tu_cs_emit_regs(cs, GRAS_SU_STENCIL_CNTL(CHIP, stencil_test_enable));
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_MASK(
.mask = ds->stencil.front.compare_mask,
.bfmask = ds->stencil.back.compare_mask));
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_WRITE_MASK(
.wrmask = ds->stencil.front.write_mask,
.bfwrmask = ds->stencil.back.write_mask));
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_REF_CNTL(
.ref = ds->stencil.front.reference,
.bfref = ds->stencil.back.reference));
tu_cs_emit_regs(cs,
A6XX_RB_DEPTH_BOUND_MIN(ds->depth.bounds_test.min),
A6XX_RB_DEPTH_BOUND_MAX(ds->depth.bounds_test.max));
}
static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
};
template <chip CHIP>
static unsigned
tu6_rb_depth_cntl_size(struct tu_device *dev,
const struct vk_depth_stencil_state *ds,
const struct vk_render_pass_state *rp,
const struct vk_rasterization_state *rs)
{
return 4;
}
template <chip CHIP>
static void
tu6_emit_rb_depth_cntl(struct tu_cs *cs,
const struct vk_depth_stencil_state *ds,
const struct vk_render_pass_state *rp,
const struct vk_rasterization_state *rs)
{
if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
bool depth_test = ds->depth.test_enable;
enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
/* On some GPUs it is necessary to enable z test for depth bounds test
* when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
* required to pass z test. Relevant tests:
* dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
* dEQP-VK.dynamic_state.ds_state.depth_bounds_1
*/
if (ds->depth.bounds_test.enable &&
!ds->depth.test_enable &&
cs->device->physical_device->info->props.depth_bounds_require_depth_test_quirk) {
depth_test = true;
zfunc = FUNC_ALWAYS;
}
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
.z_test_enable = depth_test,
.z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
.zfunc = zfunc,
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
.z_read_enable =
(ds->depth.test_enable && (zfunc != FUNC_NEVER && zfunc != FUNC_ALWAYS)) ||
ds->depth.bounds_test.enable,
.z_bounds_enable = ds->depth.bounds_test.enable));
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP, depth_test));
} else {
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP));
}
}
static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
};
template <chip CHIP>
static unsigned
tu6_prim_mode_sysmem_size(struct tu_device *dev,
struct tu_shader *fs,
bool raster_order_attachment_access,
VkImageAspectFlags feedback_loops,
bool *sysmem_single_prim_mode)
{
return 2;
}
template <chip CHIP>
static void
tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
struct tu_shader *fs,
bool raster_order_attachment_access,
VkImageAspectFlags feedback_loops,
bool *sysmem_single_prim_mode)
{
/* VK_EXT_rasterization_order_attachment_access:
*
    * This extension allows access to framebuffer attachments when used as both
* input and color attachments from one fragment to the next, in
* rasterization order, without explicit synchronization.
*/
raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
/* If there is a feedback loop, then the shader can read the previous value
* of a pixel being written out. It can also write some components and then
* read different components without a barrier in between. This is a
* problem in sysmem mode with UBWC, because the main buffer and flags
* buffer can get out-of-sync if only one is flushed. We fix this by
* setting the SINGLE_PRIM_MODE field to the same value that the blob does
* for advanced_blend in sysmem mode if a feedback loop is detected.
*/
enum a6xx_single_prim_mode sysmem_prim_mode =
(raster_order_attachment_access || feedback_loops ||
fs->fs.dynamic_input_attachments_used) ?
FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
*sysmem_single_prim_mode = true;
tu_cs_emit_regs(cs, GRAS_SC_CNTL(CHIP,
.single_prim_mode = sysmem_prim_mode,
.ccusinglecachelinesize = 2,
));
}
static const enum mesa_vk_dynamic_graphics_state tu_fragment_shading_rate_state[] = {
MESA_VK_DYNAMIC_FSR,
};
template <chip CHIP>
static unsigned
tu6_fragment_shading_rate_size(struct tu_device *dev,
const vk_fragment_shading_rate_state *fsr,
bool enable_att_fsr,
bool enable_prim_fsr,
bool fs_reads_fsr)
{
return 6;
}
template <chip CHIP>
static void
tu6_emit_fragment_shading_rate(struct tu_cs *cs,
const vk_fragment_shading_rate_state *fsr,
bool enable_att_fsr,
bool enable_prim_fsr,
bool fs_reads_fsr)
{
   /* gl_ShadingRateEXT doesn't read back as 1x1 with a null config, so if
    * the fragment shader reads it we have to emit the config anyway.
    */
if (!fsr || (!fs_reads_fsr && vk_fragment_shading_rate_is_disabled(fsr))) {
tu_cs_emit_regs(cs, A6XX_RB_VRS_CONFIG());
tu_cs_emit_regs(cs, SP_VRS_CONFIG(CHIP));
tu_cs_emit_regs(cs, GRAS_VRS_CONFIG(CHIP));
return;
}
uint32_t frag_width = fsr->fragment_size.width;
uint32_t frag_height = fsr->fragment_size.height;
bool enable_draw_fsr = true;
if (enable_att_fsr) {
if (fsr->combiner_ops[1] ==
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
enable_draw_fsr = false;
enable_prim_fsr = false;
} else if (fsr->combiner_ops[1] ==
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
enable_att_fsr = false;
}
}
if (enable_prim_fsr) {
if (fsr->combiner_ops[0] ==
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
enable_draw_fsr = false;
} else if (fsr->combiner_ops[0] ==
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
enable_prim_fsr = false;
}
}
tu_cs_emit_regs(
cs,
A6XX_RB_VRS_CONFIG(.unk2 = true, .pipeline_fsr_enable = enable_draw_fsr,
.attachment_fsr_enable = enable_att_fsr,
.primitive_fsr_enable = enable_prim_fsr));
tu_cs_emit_regs(cs,
SP_VRS_CONFIG(CHIP, .pipeline_fsr_enable = enable_draw_fsr,
.attachment_fsr_enable = enable_att_fsr,
.primitive_fsr_enable = enable_prim_fsr));
tu_cs_emit_regs(
cs, GRAS_VRS_CONFIG(CHIP,
.pipeline_fsr_enable = enable_draw_fsr,
.frag_size_x = util_logbase2(frag_width),
.frag_size_y = util_logbase2(frag_height),
.combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
.combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
.attachment_fsr_enable = enable_att_fsr,
.primitive_fsr_enable = enable_prim_fsr));
}
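/* Helper for tu_pipeline_builder_emit_state(): returns true if every piece
 * of dynamic state listed in state_array is known for this pipeline (and
 * extra_cond holds), in which case the caller can bake the draw state now
 * and tentatively mark it for removal.  Otherwise the partially-known bits
 * are recorded in "keep" so that no other group can remove them either.
 */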
static inline bool
emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
BITSET_WORD *pipeline_set,
const enum mesa_vk_dynamic_graphics_state *state_array,
unsigned num_states, bool extra_cond,
struct tu_pipeline_builder *builder)
{
BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
/* Unrolling this loop should produce a constant value once the function is
    * inlined, because state_array and num_states are per-draw-state
    * constants, but GCC seems to need a little encouragement. clang does a
* little better but still needs a pragma when there are a large number of
* states.
*/
#if defined(__clang__)
#pragma clang loop unroll(full)
#elif defined(__GNUC__) && __GNUC__ >= 8
#pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
#endif
for (unsigned i = 0; i < num_states; i++) {
BITSET_SET(state, state_array[i]);
}
/* If all of the state is set, then after we emit it we can tentatively
* remove it from the states to set for the pipeline by making it dynamic.
* If we can't emit it, though, we need to keep around the partial state so
* that we can emit it later, even if another draw state consumes it. That
* is, we have to cancel any tentative removal.
*/
BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
memcpy(temp, pipeline_set, sizeof(temp));
BITSET_AND(temp, temp, state);
if (!BITSET_EQUAL(temp, state) || !extra_cond) {
__bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
return false;
}
__bitset_or(remove, remove, state, ARRAY_SIZE(state));
return true;
}
template <chip CHIP>
static void
tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
struct tu_cs cs;
BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
#define EMIT_STATE(name, extra_cond) \
emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state, \
ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
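/* DRAW_STATE_COND(name, id, cond, args...) emits the "name" draw state at
 * pipeline-build time when all of its input state is static (and cond
 * holds), stores it in dynamic_state[id] and records the group in
 * set_state_mask so it doesn't have to be re-emitted at draw time.
 */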
#define DRAW_STATE_COND(name, id, extra_cond, ...) \
if (EMIT_STATE(name, extra_cond)) { \
unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__); \
if (size > 0) { \
tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); \
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
pipeline->dynamic_state[id] = \
tu_cs_end_draw_state(&pipeline->cs, &cs); \
} \
pipeline->set_state_mask |= (1u << id); \
}
#define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
builder->graphics_state.vi);
DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
builder->graphics_state.vi);
/* If (a) per-view viewport is used or (b) we don't know yet, then we need
    * to set viewport and scissor state dynamically.
*/
bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
!pipeline->program.per_view_viewport &&
!pipeline->program.per_layer_viewport;
DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
builder->graphics_state.vp,
builder->graphics_state.rs);
DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
builder->graphics_state.vp);
DRAW_STATE(sample_locations,
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
builder->graphics_state.ms->sample_locations_enable,
builder->graphics_state.ms->sample_locations);
DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
builder->graphics_state.rs);
bool attachments_valid =
builder->graphics_state.rp &&
vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
struct vk_color_blend_state dummy_cb = {};
const struct vk_color_blend_state *cb = builder->graphics_state.cb;
if (attachments_valid &&
!(builder->graphics_state.rp->attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
/* If there are no color attachments, then the original blend state may
* be NULL and the common code sanitizes it to always be NULL. In this
* case we want to emit an empty blend/bandwidth/etc. rather than
* letting it be dynamic (and potentially garbage).
*/
cb = &dummy_cb;
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
}
DRAW_STATE_COND(blend, TU_DYNAMIC_STATE_BLEND, attachments_valid, cb,
builder->graphics_state.cal,
builder->graphics_state.rp,
builder->graphics_state.ms->alpha_to_coverage_enable,
builder->graphics_state.ms->alpha_to_one_enable,
builder->graphics_state.ms->sample_mask);
if (EMIT_STATE(blend_lrz, attachments_valid))
tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
builder->graphics_state.rp);
if (EMIT_STATE(bandwidth, attachments_valid))
tu_calc_bandwidth(&pipeline->bandwidth, cb,
builder->graphics_state.rp);
if (EMIT_STATE(
disable_fs,
attachments_valid && pipeline_contains_all_shader_state(pipeline)))
tu_emit_disable_fs(&pipeline->disable_fs, cb,
builder->graphics_state.rp,
builder->graphics_state.ms->alpha_to_coverage_enable,
pipeline->shaders[MESA_SHADER_FRAGMENT]);
   DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
if (attachments_valid &&
!(builder->graphics_state.rp->attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
/* Don't actually make anything dynamic as that may mean a partially-set
* state group where the group is NULL which angers common code.
*/
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
}
DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
pipeline_contains_all_shader_state(pipeline) &&
pipeline->disable_fs.valid,
builder->graphics_state.rs, builder->graphics_state.vp,
builder->graphics_state.rp->view_mask != 0,
pipeline->program.per_view_viewport,
pipeline->disable_fs.disable_fs);
DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
attachments_valid,
builder->graphics_state.ds,
builder->graphics_state.rp);
DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
attachments_valid,
builder->graphics_state.ds,
builder->graphics_state.rp,
builder->graphics_state.rs);
DRAW_STATE_COND(patch_control_points,
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
pipeline_contains_all_shader_state(pipeline),
pipeline->shaders[MESA_SHADER_VERTEX],
pipeline->shaders[MESA_SHADER_TESS_CTRL],
pipeline->shaders[MESA_SHADER_TESS_EVAL],
&pipeline->program,
builder->graphics_state.ts->patch_control_points);
bool has_raster_order_state = false;
if (pipeline->type == TU_PIPELINE_GRAPHICS) {
has_raster_order_state = true;
} else {
struct tu_graphics_lib_pipeline *lib =
tu_pipeline_to_graphics_lib(pipeline);
has_raster_order_state =
(lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
(lib->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
}
if (!builder->device->physical_device->info->props.has_coherent_ubwc_flag_caches) {
DRAW_STATE_COND(prim_mode_sysmem,
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
has_raster_order_state,
pipeline->shaders[MESA_SHADER_FRAGMENT],
pipeline->output.raster_order_attachment_access ||
pipeline->ds.raster_order_attachment_access,
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
&pipeline->prim_order.sysmem_single_prim_mode);
}
if (builder->device->physical_device->info->props.has_attachment_shading_rate) {
bool has_fsr_att =
builder->graphics_state.pipeline_flags &
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
DRAW_STATE_COND(fragment_shading_rate,
TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
attachments_valid && pipeline_contains_all_shader_state(pipeline),
builder->graphics_state.fsr,
has_fsr_att,
pipeline->program.writes_shading_rate,
pipeline->program.reads_shading_rate);
}
#undef DRAW_STATE
#undef DRAW_STATE_COND
#undef EMIT_STATE
/* LRZ always needs depth/stencil state at draw time */
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
/* MSAA needs line mode */
BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
/* The patch control points is part of the draw */
BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
/* Vertex buffer state needs to know the max valid binding */
BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
/* Remove state which has been emitted and we no longer need to set when
* binding the pipeline by making it "dynamic".
*/
BITSET_ANDNOT(remove, remove, keep);
BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
remove);
}
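/* Draw-time counterpart of emit_pipeline_state(): returns true if any of the
 * states listed in state_array is dirty on the command buffer.
 */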
static inline bool
emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
const enum mesa_vk_dynamic_graphics_state *state_array,
unsigned num_states)
{
BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
/* Unrolling this loop should produce a constant value once the function is
    * inlined, because state_array and num_states are per-draw-state
    * constants, but GCC seems to need a little encouragement. clang does a
* little better but still needs a pragma when there are a large number of
* states.
*/
#if defined(__clang__)
#pragma clang loop unroll(full)
#elif defined(__GNUC__) && __GNUC__ >= 8
#pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
#endif
for (unsigned i = 0; i < num_states; i++) {
BITSET_SET(state, state_array[i]);
}
BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
BITSET_AND(temp, state, dynamic_state->dirty);
return !BITSET_IS_EMPTY(temp);
}
template <chip CHIP>
uint32_t
tu_emit_draw_state(struct tu_cmd_buffer *cmd)
{
struct tu_cs cs;
uint32_t dirty_draw_states = 0;
#define EMIT_STATE(name) \
emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \
ARRAY_SIZE(tu_##name##_state))
#define DRAW_STATE_COND(name, id, extra_cond, ...) \
if ((EMIT_STATE(name) || (extra_cond)) && \
!(cmd->state.pipeline_draw_states & (1u << id))) { \
unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
if (size > 0) { \
tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
cmd->state.dynamic_state[id] = \
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
} else { \
cmd->state.dynamic_state[id] = {}; \
} \
dirty_draw_states |= (1u << id); \
}
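/* FDM/per-layer-viewport variant: when a fragment density map or per-layer
 * viewports are in use, the final values depend on the bin layout, so the
 * tu6_emit_*_fdm() helpers write into a writeable sub-stream that can be
 * patched later instead of a fully baked draw state.
 */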
#define DRAW_STATE_FDM(name, id, ...) \
if ((EMIT_STATE(name) || (cmd->state.dirty & \
(TU_CMD_DIRTY_FDM | \
TU_CMD_DIRTY_PER_VIEW_VIEWPORT))) && \
!(cmd->state.pipeline_draw_states & (1u << id))) { \
if (cmd->state.has_fdm || cmd->state.per_layer_viewport) { \
tu_cs_set_writeable(&cmd->sub_cs, true); \
tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__); \
cmd->state.dynamic_state[id] = \
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
tu_cs_set_writeable(&cmd->sub_cs, false); \
} else { \
unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
if (size > 0) { \
tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
cmd->state.dynamic_state[id] = \
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
} else { \
cmd->state.dynamic_state[id] = {}; \
} \
} \
dirty_draw_states |= (1u << id); \
}
#define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
cmd->vk.dynamic_graphics_state.vi);
/* Vertex input stride is special because it's part of the vertex input in
    * the pipeline but a separate array when it's dynamic state, so we have to
* use two separate functions.
*/
#define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
#define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
cmd->vk.dynamic_graphics_state.vi_binding_strides,
cmd->vk.dynamic_graphics_state.vi_bindings_valid);
#undef tu6_emit_vertex_stride
#undef tu6_vertex_stride_size
DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
&cmd->vk.dynamic_graphics_state.vp,
&cmd->vk.dynamic_graphics_state.rs);
DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
&cmd->vk.dynamic_graphics_state.vp);
DRAW_STATE(sample_locations,
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
cmd->vk.dynamic_graphics_state.ms.sample_locations);
DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
&cmd->vk.dynamic_graphics_state.rs);
DRAW_STATE_COND(blend, TU_DYNAMIC_STATE_BLEND,
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
&cmd->vk.dynamic_graphics_state.cb,
&cmd->vk.dynamic_graphics_state.cal,
&cmd->state.vk_rp,
cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
cmd->vk.dynamic_graphics_state.ms.sample_mask);
if (!cmd->state.pipeline_blend_lrz &&
(EMIT_STATE(blend_lrz) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS))) {
tu_lrz_blend_status blend_status = tu6_calc_blend_lrz(
&cmd->vk.dynamic_graphics_state.cb, &cmd->state.vk_rp);
if (blend_status != cmd->state.lrz_blend_status) {
cmd->state.lrz_blend_status = blend_status;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
}
if (!cmd->state.pipeline_bandwidth &&
(EMIT_STATE(bandwidth) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS)))
tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
&cmd->state.vk_rp);
if (!cmd->state.pipeline_disable_fs &&
(EMIT_STATE(disable_fs) ||
(cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_FS)))) {
bool disable_fs = tu_calc_disable_fs(
&cmd->vk.dynamic_graphics_state.cb, &cmd->state.vk_rp,
cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
cmd->state.shaders[MESA_SHADER_FRAGMENT]);
if (disable_fs != cmd->state.disable_fs) {
cmd->state.disable_fs = disable_fs;
cmd->state.dirty |= TU_CMD_DIRTY_DISABLE_FS;
}
}
DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
&cmd->vk.dynamic_graphics_state.cb);
if (cmd->device->physical_device->info->props.has_attachment_shading_rate) {
DRAW_STATE_COND(fragment_shading_rate,
TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_SHADING_RATE),
&cmd->vk.dynamic_graphics_state.fsr,
cmd->state.subpass->fsr_attachment != VK_ATTACHMENT_UNUSED,
cmd->state.program.writes_shading_rate,
cmd->state.program.reads_shading_rate);
}
DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
TU_CMD_DIRTY_PER_VIEW_VIEWPORT |
TU_CMD_DIRTY_DISABLE_FS),
&cmd->vk.dynamic_graphics_state.rs,
&cmd->vk.dynamic_graphics_state.vp,
cmd->state.vk_rp.view_mask != 0,
cmd->state.per_view_viewport,
cmd->state.disable_fs);
DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
&cmd->vk.dynamic_graphics_state.ds,
&cmd->state.vk_rp);
DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
&cmd->vk.dynamic_graphics_state.ds,
&cmd->state.vk_rp,
&cmd->vk.dynamic_graphics_state.rs);
DRAW_STATE_COND(patch_control_points,
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
cmd->state.shaders[MESA_SHADER_VERTEX],
cmd->state.shaders[MESA_SHADER_TESS_CTRL],
cmd->state.shaders[MESA_SHADER_TESS_EVAL],
&cmd->state.program,
cmd->vk.dynamic_graphics_state.ts.patch_control_points);
if (!cmd->device->physical_device->info->props.has_coherent_ubwc_flag_caches) {
DRAW_STATE_COND(prim_mode_sysmem,
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
TU_CMD_DIRTY_FEEDBACK_LOOPS |
TU_CMD_DIRTY_FS),
cmd->state.shaders[MESA_SHADER_FRAGMENT],
cmd->state.raster_order_attachment_access,
cmd->vk.dynamic_graphics_state.feedback_loops |
cmd->state.pipeline_feedback_loops,
&cmd->state.rp.sysmem_single_prim_mode);
}
#undef DRAW_STATE
#undef DRAW_STATE_COND
#undef EMIT_STATE
return dirty_draw_states;
}
TU_GENX(tu_emit_draw_state);
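/* Record whether rasterization-order access was requested for the
 * depth/stencil attachment; this later feeds the GMEM/sysmem prim mode
 * selection.
 */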
static void
tu_pipeline_builder_parse_depth_stencil(
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
{
const VkPipelineDepthStencilStateCreateInfo *ds_info =
builder->create_info->pDepthStencilState;
if ((builder->graphics_state.rp->attachments ==
MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
(builder->graphics_state.rp->attachments &
MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
pipeline->ds.raster_order_attachment_access =
ds_info && (ds_info->flags &
(VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
}
}
static void
tu_pipeline_builder_parse_multisample_and_color_blend(
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
{
/* The spec says:
*
* pMultisampleState is a pointer to an instance of the
* VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
* has rasterization disabled.
*
* Also,
*
* pColorBlendState is a pointer to an instance of the
* VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
* pipeline has rasterization disabled or if the subpass of the render
* pass the pipeline is created against does not use any color
* attachments.
*
* We leave the relevant registers stale when rasterization is disabled.
*/
if (builder->rasterizer_discard) {
return;
}
static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
const VkPipelineColorBlendStateCreateInfo *blend_info =
(builder->graphics_state.rp->attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
? builder->create_info->pColorBlendState
: &dummy_blend_info;
if (builder->graphics_state.rp->attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
pipeline->output.raster_order_attachment_access =
blend_info && (blend_info->flags &
VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
}
}
template <chip CHIP>
static void
tu_pipeline_builder_parse_rasterization_order(
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
{
if (builder->rasterizer_discard)
return;
bool raster_order_attachment_access =
pipeline->output.raster_order_attachment_access ||
pipeline->ds.raster_order_attachment_access ||
TU_DEBUG(RAST_ORDER);
/* VK_EXT_blend_operation_advanced would also require ordered access
* when implemented in the future.
*/
enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
if (raster_order_attachment_access) {
/* VK_EXT_rasterization_order_attachment_access:
*
       * This extension allows access to framebuffer attachments when used as
* both input and color attachments from one fragment to the next,
* in rasterization order, without explicit synchronization.
*/
gmem_prim_mode = FLUSH_PER_OVERLAP;
}
struct tu_cs cs;
pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
tu_cs_emit_regs(&cs, GRAS_SC_CNTL(CHIP,
.single_prim_mode = gmem_prim_mode,
.ccusinglecachelinesize = 2,
));
}
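/* Free everything owned by the pipeline: its suballocated CS/BO, any cached
 * shader and NIR objects, per-library descriptor set layout references and
 * the executables ralloc context.
 */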
static void
tu_pipeline_finish(struct tu_pipeline *pipeline,
struct tu_device *dev,
const VkAllocationCallbacks *alloc)
{
tu_cs_finish(&pipeline->cs);
TU_RMV(resource_destroy, dev, &pipeline->bo);
mtx_lock(&dev->pipeline_mutex);
tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
mtx_unlock(&dev->pipeline_mutex);
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
struct tu_graphics_lib_pipeline *library =
tu_pipeline_to_graphics_lib(pipeline);
if (library->nir_shaders)
vk_pipeline_cache_object_unref(&dev->vk,
&library->nir_shaders->base);
for (unsigned i = 0; i < library->num_sets; i++) {
if (library->layouts[i])
vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
}
vk_free2(&dev->vk.alloc, alloc, library->state_data);
}
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
if (pipeline->shaders[i])
vk_pipeline_cache_object_unref(&dev->vk,
&pipeline->shaders[i]->base);
}
ralloc_free(pipeline->executables_mem_ctx);
}
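/* Map a single shader stage to the graphics-pipeline-library subset that
 * owns it, so that only stages relevant to the subsets being built are
 * imported.
 */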
static VkGraphicsPipelineLibraryFlagBitsEXT
vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
{
assert(util_bitcount(stage) == 1);
switch (stage) {
case VK_SHADER_STAGE_VERTEX_BIT:
case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
case VK_SHADER_STAGE_GEOMETRY_BIT:
case VK_SHADER_STAGE_TASK_BIT_EXT:
case VK_SHADER_STAGE_MESH_BIT_EXT:
return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
case VK_SHADER_STAGE_FRAGMENT_BIT:
return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
default:
UNREACHABLE("Invalid shader stage");
}
}
template <chip CHIP>
static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
struct tu_pipeline **pipeline)
{
VkResult result;
if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
*pipeline = (struct tu_pipeline *) vk_object_zalloc(
&builder->device->vk, builder->alloc,
sizeof(struct tu_graphics_lib_pipeline),
VK_OBJECT_TYPE_PIPELINE);
if (!*pipeline)
return VK_ERROR_OUT_OF_HOST_MEMORY;
(*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
} else {
*pipeline = (struct tu_pipeline *) vk_object_zalloc(
&builder->device->vk, builder->alloc,
sizeof(struct tu_graphics_pipeline),
VK_OBJECT_TYPE_PIPELINE);
if (!*pipeline)
return VK_ERROR_OUT_OF_HOST_MEMORY;
(*pipeline)->type = TU_PIPELINE_GRAPHICS;
}
(*pipeline)->executables_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
tu_pipeline_builder_parse_libraries(builder, *pipeline);
VkShaderStageFlags stages = 0;
for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
/* Ignore shader stages that don't need to be imported. */
if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
continue;
stages |= stage;
}
builder->active_stages = stages;
(*pipeline)->active_stages = stages;
for (unsigned i = 0; i < builder->num_libraries; i++)
(*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
/* Compile and upload shaders unless a library has already done that. */
if ((*pipeline)->program.vs_state.size == 0) {
tu_pipeline_builder_parse_layout(builder, *pipeline);
result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
if (result != VK_SUCCESS) {
tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
return result;
}
}
   result = tu_pipeline_allocate_cs(builder->device, *pipeline,
                                    &builder->layout, builder, NULL);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }
   if (set_combined_state(builder, *pipeline,
                          VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
                          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
(*pipeline)->shaders);
if (CHIP == A6XX) {
/* Blob doesn't preload state on A7XX, likely preloading either
* doesn't work or doesn't provide benefits.
*/
tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
}
}
if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
}
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
}
if (set_combined_state(builder, *pipeline,
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
tu_pipeline_builder_parse_rasterization_order<CHIP>(builder, *pipeline);
}
tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
struct tu_graphics_lib_pipeline *library =
tu_pipeline_to_graphics_lib(*pipeline);
result = vk_graphics_pipeline_state_copy(&builder->device->vk,
&library->graphics_state,
&builder->graphics_state,
builder->alloc,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
&library->state_data);
if (result != VK_SUCCESS) {
tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
return result;
}
} else {
struct tu_graphics_pipeline *gfx_pipeline =
tu_pipeline_to_graphics(*pipeline);
gfx_pipeline->dynamic_state.ms.sample_locations =
&gfx_pipeline->sample_locations;
vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
&builder->graphics_state);
gfx_pipeline->feedback_loops =
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
gfx_pipeline->feedback_loop_may_involve_textures =
builder->graphics_state.feedback_loop_not_input_only;
}
return VK_SUCCESS;
}
static void
tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
{
ralloc_free(builder->mem_ctx);
}
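/* Translate a turnip render pass/subpass into the common vk_render_pass_state
 * that vk_graphics_pipeline_state_fill() consumes when the app isn't using
 * dynamic rendering.
 */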
void
tu_fill_render_pass_state(struct vk_render_pass_state *rp,
const struct tu_render_pass *pass,
const struct tu_subpass *subpass)
{
rp->view_mask = subpass->multiview_mask;
rp->color_attachment_count = subpass->color_count;
const uint32_t a = subpass->depth_stencil_attachment.attachment;
rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
if (a != VK_ATTACHMENT_UNUSED) {
VkFormat ds_format = pass->attachments[a].format;
if (vk_format_has_depth(ds_format) && subpass->depth_used) {
rp->depth_attachment_format = ds_format;
rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
}
if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
rp->stencil_attachment_format = ds_format;
rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
}
}
for (uint32_t i = 0; i < subpass->color_count; i++) {
const uint32_t a = subpass->color_attachments[i].attachment;
if (a == VK_ATTACHMENT_UNUSED) {
rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
continue;
}
rp->color_attachment_formats[i] = pass->attachments[a].format;
rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
}
rp->custom_resolve = subpass->custom_resolve;
}
static void
tu_pipeline_builder_init_graphics(
struct tu_pipeline_builder *builder,
struct tu_device *dev,
struct vk_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *create_info,
VkPipelineCreateFlags2KHR flags,
const VkAllocationCallbacks *alloc)
{
*builder = (struct tu_pipeline_builder) {
.device = dev,
.mem_ctx = ralloc_context(NULL),
.cache = cache,
.alloc = alloc,
.create_info = create_info,
.create_flags = flags,
};
const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
vk_find_struct_const(builder->create_info->pNext,
GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
const VkPipelineLibraryCreateInfoKHR *library_info =
vk_find_struct_const(builder->create_info->pNext,
PIPELINE_LIBRARY_CREATE_INFO_KHR);
if (gpl_info) {
builder->state = gpl_info->flags;
} else {
/* Implement this bit of spec text:
*
* If this structure is omitted, and either
* VkGraphicsPipelineCreateInfo::flags includes
* VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
* VkGraphicsPipelineCreateInfo::pNext chain includes a
* VkPipelineLibraryCreateInfoKHR structure with a libraryCount
* greater than 0, it is as if flags is 0. Otherwise if this
* structure is omitted, it is as if flags includes all possible
* subsets of the graphics pipeline (i.e. a complete graphics
* pipeline).
*/
if ((library_info && library_info->libraryCount > 0) ||
(builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
builder->state = 0;
} else {
builder->state =
VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
}
}
bool rasterizer_discard_dynamic = false;
if (create_info->pDynamicState) {
for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
if (create_info->pDynamicState->pDynamicStates[i] ==
VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
rasterizer_discard_dynamic = true;
break;
}
}
}
builder->rasterizer_discard =
(builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
!rasterizer_discard_dynamic &&
builder->create_info->pRasterizationState->rasterizerDiscardEnable;
struct vk_render_pass_state rp_state = {};
const struct vk_render_pass_state *driver_rp = NULL;
VkPipelineCreateFlags2KHR rp_flags = 0;
builder->unscaled_input_fragcoord = 0;
/* Extract information we need from the turnip renderpass. This will be
* filled out automatically if the app is using dynamic rendering or
* renderpasses are emulated.
*/
if (!TU_DEBUG(DYNAMIC) &&
(builder->state &
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
builder->create_info->renderPass) {
const struct tu_render_pass *pass =
tu_render_pass_from_handle(create_info->renderPass);
const struct tu_subpass *subpass =
&pass->subpasses[create_info->subpass];
tu_fill_render_pass_state(&rp_state, pass, subpass);
for (unsigned i = 0; i < subpass->input_count; i++) {
/* Input attachments stored in GMEM must be loaded with unscaled
* FragCoord.
*/
if (subpass->input_attachments[i].patch_input_gmem)
builder->unscaled_input_fragcoord |= 1u << i;
}
if (subpass->feedback_loop_color) {
rp_flags |=
VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
}
if (subpass->feedback_loop_ds) {
rp_flags |=
VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
}
if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
rp_flags |=
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
}
if (subpass->fsr_attachment != VK_ATTACHMENT_UNUSED) {
rp_flags |=
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
}
if (pass->has_layered_fdm) {
rp_flags |=
VK_PIPELINE_CREATE_2_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE;
}
driver_rp = &rp_state;
}
vk_graphics_pipeline_state_fill(&dev->vk,
&builder->graphics_state,
builder->create_info,
driver_rp,
rp_flags,
&builder->all_state,
NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
NULL);
if (builder->graphics_state.rp) {
builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
TU_DEBUG(FDM);
builder->fdm_per_layer = (builder->graphics_state.pipeline_flags &
VK_PIPELINE_CREATE_2_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE);
if (builder->fdm_per_layer) {
const VkPipelineFragmentDensityMapLayeredCreateInfoVALVE *fdm_layered_info =
vk_find_struct_const(create_info->pNext,
PIPELINE_FRAGMENT_DENSITY_MAP_LAYERED_CREATE_INFO_VALVE);
if (fdm_layered_info) {
builder->max_fdm_layers =
fdm_layered_info->maxFragmentDensityMapLayers;
}
}
}
}
template <chip CHIP>
static VkResult
tu_graphics_pipeline_create(VkDevice device,
VkPipelineCache pipelineCache,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
VkPipelineCreateFlags2KHR flags,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipeline)
{
VK_FROM_HANDLE(tu_device, dev, device);
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
cache = cache ? cache : dev->mem_cache;
struct tu_pipeline_builder builder;
tu_pipeline_builder_init_graphics(&builder, dev, cache,
pCreateInfo, flags, pAllocator);
struct tu_pipeline *pipeline = NULL;
VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
tu_pipeline_builder_finish(&builder);
if (result == VK_SUCCESS) {
TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
*pPipeline = tu_pipeline_to_handle(pipeline);
} else
*pPipeline = VK_NULL_HANDLE;
return result;
}
template <chip CHIP>
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateGraphicsPipelines(VkDevice device,
VkPipelineCache pipelineCache,
uint32_t count,
const VkGraphicsPipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
MESA_TRACE_FUNC();
VkResult final_result = VK_SUCCESS;
uint32_t i = 0;
for (; i < count; i++) {
VkPipelineCreateFlags2KHR flags =
vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
VkResult result =
tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
&pCreateInfos[i], flags,
pAllocator, &pPipelines[i]);
if (result != VK_SUCCESS) {
final_result = result;
pPipelines[i] = VK_NULL_HANDLE;
if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
break;
}
}
for (; i < count; i++)
pPipelines[i] = VK_NULL_HANDLE;
return final_result;
}
TU_GENX(tu_CreateGraphicsPipelines);
template <chip CHIP>
static VkResult
tu_compute_pipeline_create(VkDevice device,
VkPipelineCache pipelineCache,
const VkComputePipelineCreateInfo *pCreateInfo,
VkPipelineCreateFlags2KHR flags,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipeline)
{
VK_FROM_HANDLE(tu_device, dev, device);
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
VkResult result;
const struct ir3_shader_variant *v = NULL;
cache = cache ? cache : dev->mem_cache;
struct tu_compute_pipeline *pipeline;
*pPipeline = VK_NULL_HANDLE;
VkPipelineCreationFeedback pipeline_feedback = {
.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
int64_t pipeline_start = os_time_get_nano();
pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
&dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
if (!pipeline)
return VK_ERROR_OUT_OF_HOST_MEMORY;
pipeline->base.type = TU_PIPELINE_COMPUTE;
pipeline->base.executables_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
struct tu_shader_key key = { };
bool allow_varying_subgroup_size =
(stage_info->flags &
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
bool require_full_subgroups =
stage_info->flags &
VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
vk_find_struct_const(stage_info,
PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
require_full_subgroups, subgroup_info,
dev);
struct vk_pipeline_robustness_state rs;
vk_pipeline_robustness_state_fill(&dev->vk, &rs,
pCreateInfo->pNext,
stage_info->pNext);
tu_shader_key_robustness(&key, &rs);
void *pipeline_mem_ctx = ralloc_context(NULL);
unsigned char pipeline_sha1[20];
tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key);
struct tu_shader *shader = NULL;
const bool executable_info = flags &
VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
bool application_cache_hit = false;
if (!executable_info) {
shader =
tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
&application_cache_hit);
}
if (application_cache_hit && cache != dev->mem_cache) {
pipeline_feedback.flags |=
VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
char *nir_initial_disasm = NULL;
if (!shader) {
if (flags &
VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
result = VK_PIPELINE_COMPILE_REQUIRED;
goto fail;
}
struct ir3_shader_key ir3_key = {};
nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
stage_info, &key, MESA_SHADER_COMPUTE);
nir_initial_disasm = executable_info ?
nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
pipeline_sha1, sizeof(pipeline_sha1), layout,
executable_info);
if (!shader) {
goto fail;
}
shader = tu_pipeline_cache_insert(cache, shader);
}
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
if (creation_feedback) {
*creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
if (creation_feedback->pipelineStageCreationFeedbackCount > 0) {
assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
}
}
pipeline->base.active_desc_sets = shader->active_desc_sets;
v = shader->variant;
tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
&shader->const_state, v);
result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
if (result != VK_SUCCESS)
goto fail;
for (int i = 0; i < 3; i++)
pipeline->local_size[i] = v->local_size[i];
if (CHIP == A6XX) {
tu6_emit_load_state(dev, &pipeline->base, layout);
}
tu_append_executable(&pipeline->base, v, nir_initial_disasm);
pipeline->instrlen = v->instrlen;
pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
ralloc_free(pipeline_mem_ctx);
TU_RMV(compute_pipeline_create, dev, pipeline);
*pPipeline = tu_pipeline_to_handle(&pipeline->base);
return VK_SUCCESS;
fail:
if (shader)
vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
ralloc_free(pipeline->base.executables_mem_ctx);
ralloc_free(pipeline_mem_ctx);
vk_object_free(&dev->vk, pAllocator, pipeline);
return result;
}
template <chip CHIP>
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateComputePipelines(VkDevice device,
VkPipelineCache pipelineCache,
uint32_t count,
const VkComputePipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
MESA_TRACE_FUNC();
VkResult final_result = VK_SUCCESS;
uint32_t i = 0;
for (; i < count; i++) {
VkPipelineCreateFlags2KHR flags =
vk_compute_pipeline_create_flags(&pCreateInfos[i]);
VkResult result =
tu_compute_pipeline_create<CHIP>(device, pipelineCache,
&pCreateInfos[i], flags,
pAllocator, &pPipelines[i]);
if (result != VK_SUCCESS) {
final_result = result;
pPipelines[i] = VK_NULL_HANDLE;
if (flags &
VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
break;
}
}
for (; i < count; i++)
pPipelines[i] = VK_NULL_HANDLE;
return final_result;
}
TU_GENX(tu_CreateComputePipelines);
VKAPI_ATTR void VKAPI_CALL
tu_DestroyPipeline(VkDevice _device,
VkPipeline _pipeline,
const VkAllocationCallbacks *pAllocator)
{
VK_FROM_HANDLE(tu_device, dev, _device);
VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
if (!_pipeline)
return;
TU_RMV(resource_destroy, dev, pipeline);
tu_pipeline_finish(pipeline, dev, pAllocator);
vk_object_free(&dev->vk, pAllocator, pipeline);
}
static const struct tu_pipeline_executable *
tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
{
assert(index < util_dynarray_num_elements(&pipeline->executables,
struct tu_pipeline_executable));
return util_dynarray_element(
&pipeline->executables, struct tu_pipeline_executable, index);
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutablePropertiesKHR(
VkDevice _device,
const VkPipelineInfoKHR* pPipelineInfo,
uint32_t* pExecutableCount,
VkPipelineExecutablePropertiesKHR* pProperties)
{
VK_FROM_HANDLE(tu_device, dev, _device);
VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
pProperties, pExecutableCount);
util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
mesa_shader_stage stage = exe->stage;
props->stages = mesa_to_vk_shader_stage(stage);
if (!exe->is_binning)
VK_COPY_STR(props->name, _mesa_shader_stage_to_abbrev(stage));
else
VK_COPY_STR(props->name, "Binning VS");
VK_COPY_STR(props->description, _mesa_shader_stage_to_string(stage));
props->subgroupSize =
dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
}
}
return vk_outarray_status(&out);
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutableStatisticsKHR(
VkDevice _device,
const VkPipelineExecutableInfoKHR* pExecutableInfo,
uint32_t* pStatisticCount,
VkPipelineExecutableStatisticKHR* pStatistics)
{
VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
pStatistics, pStatisticCount);
const struct tu_pipeline_executable *exe =
tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
struct adreno_stats stats;
stats.maxwaves = exe->stats.max_waves;
stats.inst = exe->stats.instrs_count;
stats.code_size = exe->stats.sizedwords;
stats.nops = exe->stats.nops_count;
stats.mov = exe->stats.mov_count;
stats.cov = exe->stats.cov_count;
stats.full = exe->stats.max_reg + 1;
stats.half = exe->stats.max_half_reg + 1;
stats.last_baryf = exe->stats.last_baryf;
stats.last_helper = exe->stats.last_helper;
stats.ss = exe->stats.ss;
stats.sy = exe->stats.sy;
stats.ss_stall = exe->stats.sstall;
stats.sy_stall = exe->stats.systall;
stats.loops = exe->stats.loops;
stats.stps = exe->stats.stp_count;
stats.ldps = exe->stats.ldp_count;
stats.preamble_inst = exe->stats.preamble_instrs_count;
stats.early_preamble = exe->stats.early_preamble;
stats.constlen = exe->stats.constlen;
for (unsigned i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); ++i) {
stats.cat[i] = exe->stats.instrs_per_cat[i];
}
vk_add_adreno_stats(out, &stats);
return vk_outarray_status(&out);
}
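/* Copy a NUL-terminated IR string into the caller-provided buffer following
 * the usual Vulkan two-call idiom: with pData == NULL only the required
 * dataSize is returned; on truncation we return false so the caller reports
 * VK_INCOMPLETE.
 */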
static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
const char *data)
{
ir->isText = VK_TRUE;
size_t data_len = strlen(data) + 1;
if (ir->pData == NULL) {
ir->dataSize = data_len;
return true;
}
strncpy((char *) ir->pData, data, ir->dataSize);
if (ir->dataSize < data_len)
return false;
ir->dataSize = data_len;
return true;
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutableInternalRepresentationsKHR(
VkDevice _device,
const VkPipelineExecutableInfoKHR* pExecutableInfo,
uint32_t* pInternalRepresentationCount,
VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
{
VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
pInternalRepresentations, pInternalRepresentationCount);
bool incomplete_text = false;
const struct tu_pipeline_executable *exe =
tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
if (exe->nir_from_spirv) {
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
VK_COPY_STR(ir->name, "NIR from SPIRV");
VK_COPY_STR(ir->description, "Initial NIR before any optimizations");
if (!write_ir_text(ir, exe->nir_from_spirv))
incomplete_text = true;
}
}
if (exe->nir_final) {
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
VK_COPY_STR(ir->name, "Final NIR");
VK_COPY_STR(ir->description,
"Final NIR before going into the back-end compiler");
if (!write_ir_text(ir, exe->nir_final))
incomplete_text = true;
}
}
if (exe->disasm) {
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
VK_COPY_STR(ir->name, "IR3 Assembly");
VK_COPY_STR(ir->description,
"Final IR3 assembly for the generated shader binary");
if (!write_ir_text(ir, exe->disasm))
incomplete_text = true;
}
}
return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}