mesa/src/broadcom/vulkan/v3dv_pipeline.c

/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vk_util.h"

#include "v3dv_debug.h"
#include "v3dv_private.h"

#include "vk_format_info.h"

#include "common/v3d_debug.h"

#include "compiler/nir/nir_builder.h"

#include "vulkan/util/vk_format.h"

#include "broadcom/cle/v3dx_pack.h"

VkResult
v3dv_CreateShaderModule(VkDevice _device,
                        const VkShaderModuleCreateInfo *pCreateInfo,
                        const VkAllocationCallbacks *pAllocator,
                        VkShaderModule *pShaderModule)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_shader_module *module;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   module = vk_alloc2(&device->alloc, pAllocator,
                       sizeof(*module) + pCreateInfo->codeSize, 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (module == NULL)
      return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);

   module->size = pCreateInfo->codeSize;
   memcpy(module->data, pCreateInfo->pCode, module->size);

   _mesa_sha1_compute(module->data, module->size, module->sha1);

   *pShaderModule = v3dv_shader_module_to_handle(module);

   return VK_SUCCESS;
}

void
v3dv_DestroyShaderModule(VkDevice _device,
                         VkShaderModule _module,
                         const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_shader_module, module, _module);

   if (!module)
      return;

   vk_free2(&device->alloc, pAllocator, module);
}

static void
destroy_pipeline_stage(struct v3dv_device *device,
                       struct v3dv_pipeline_stage *p_stage,
                       const VkAllocationCallbacks *pAllocator)
{
   v3dv_bo_free(device, p_stage->assembly_bo);

   vk_free2(&device->alloc, pAllocator, p_stage);
}

void
v3dv_DestroyPipeline(VkDevice _device,
                     VkPipeline _pipeline,
                     const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);

   if (!pipeline)
      return;

   /* FIXME: we can't just use a loop over mesa stage due the bin, would be
    * good to find an alternative.
    */
   destroy_pipeline_stage(device, pipeline->vs, pAllocator);
   destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator);
   destroy_pipeline_stage(device, pipeline->fs, pAllocator);

   if (pipeline->default_attribute_values) {
      v3dv_bo_free(device, pipeline->default_attribute_values);
      pipeline->default_attribute_values = NULL;
   }

   vk_free2(&device->alloc, pAllocator, pipeline);
}

static const struct spirv_to_nir_options default_spirv_options =  {
   .caps = { false },
   .ubo_addr_format = nir_address_format_32bit_index_offset,
   .ssbo_addr_format = nir_address_format_32bit_index_offset,
   .phys_ssbo_addr_format = nir_address_format_64bit_global,
   .push_const_addr_format = nir_address_format_logical,
   .shared_addr_format = nir_address_format_32bit_offset,
   .frag_coord_is_sysval = false,
};

const nir_shader_compiler_options v3dv_nir_options = {
   .lower_all_io_to_temps = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   .lower_bitfield_reverse = true,
   .lower_bit_count = true,
   .lower_cs_local_id_from_index = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_pack_unorm_2x16 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_pack_half_2x16 = true,
   .lower_unpack_half_2x16 = true,
   /* FIXME: see if we can avoid the uadd_carry and usub_borrow lowering and
    * get the tests to pass since it might produce slightly better code.
    */
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   /* FIXME: check if we can use multop + umul24 to implement mul2x32_64
    * without lowering.
    */
   .lower_mul_2x32_64 = true,
   .lower_fdiv = true,
   .lower_find_lsb = true,
   .lower_ffma16 = true,
   .lower_ffma32 = true,
   .lower_ffma64 = true,
   .lower_flrp32 = true,
   .lower_fpow = true,
   .lower_fsat = true,
   .lower_fsqrt = true,
   .lower_ifind_msb = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_mul_high = true,
   .lower_wpos_pntc = true,
   .lower_rotate = true,
   .lower_to_scalar = true,
   .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic
                                   * needs to be supported */
};

#define OPT(pass, ...) ({                                  \
   bool this_progress = false;                             \
   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
   if (this_progress)                                      \
      progress = true;                                     \
   this_progress;                                          \
})

static void
nir_optimize(nir_shader *nir,
             struct v3dv_pipeline_stage *stage,
             bool allow_copies)
{
   bool progress;

   do {
      progress = false;
      OPT(nir_split_array_vars, nir_var_function_temp);
      OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
      OPT(nir_opt_deref);
      OPT(nir_lower_vars_to_ssa);
      if (allow_copies) {
         /* Only run this pass in the first call to nir_optimize.  Later calls
          * assume that we've lowered away any copy_deref instructions and we
          * don't want to introduce any more.
          */
         OPT(nir_opt_find_array_copies);
      }
      OPT(nir_opt_copy_prop_vars);
      OPT(nir_opt_dead_write_vars);
      OPT(nir_opt_combine_stores, nir_var_all);

      OPT(nir_lower_alu_to_scalar, NULL, NULL);

      OPT(nir_copy_prop);
      OPT(nir_lower_phis_to_scalar);

      OPT(nir_copy_prop);
      OPT(nir_opt_dce);
      OPT(nir_opt_cse);
      OPT(nir_opt_combine_stores, nir_var_all);

      /* Passing 0 to the peephole select pass causes it to convert
       * if-statements that contain only move instructions in the branches
       * regardless of the count.
       *
       * Passing 1 to the peephole select pass causes it to convert
       * if-statements that contain at most a single ALU instruction (total)
       * in both branches.
       */
      OPT(nir_opt_peephole_select, 0, false, false);
      OPT(nir_opt_peephole_select, 8, false, true);

      OPT(nir_opt_intrinsics);
      OPT(nir_opt_idiv_const, 32);
      OPT(nir_opt_algebraic);
      OPT(nir_opt_constant_folding);

      OPT(nir_opt_dead_cf);

      OPT(nir_opt_if, false);
      OPT(nir_opt_conditional_discard);

      OPT(nir_opt_remove_phis);
      OPT(nir_opt_undef);
      OPT(nir_lower_pack);
   } while (progress);

   OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
}

static void
preprocess_nir(nir_shader *nir,
               struct v3dv_pipeline_stage *stage)
{
   /* Make sure we lower variable initializers on output variables so that
    * nir_remove_dead_variables below sees the corresponding stores
    */
   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);

   /* Now that we've deleted all but the main function, we can go ahead and
    * lower the rest of the variable initializers.
    */
   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);

   /* Split member structs.  We do this before lower_io_to_temporaries so that
    * it doesn't lower system values to temporaries by accident.
    */
   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_split_per_member_structs);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_input_attachments,
                 &(nir_input_attachment_options) {
                    .use_fragcoord_sysval = false,
                       });
   }

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in |
              nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
              NULL);

   NIR_PASS_V(nir, nir_propagate_invariant);
   NIR_PASS_V(nir, nir_lower_io_to_temporaries,
              nir_shader_get_entrypoint(nir), true, false);

   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);

   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);

   NIR_PASS_V(nir, nir_normalize_cubemap_coords);

   NIR_PASS_V(nir, nir_lower_global_vars_to_local);

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp);

   nir_optimize(nir, stage, true);

   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* Lower a bunch of stuff */
   NIR_PASS_V(nir, nir_lower_var_copies);

   NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in |
              nir_var_shader_out |
              nir_var_function_temp, UINT32_MAX);

   NIR_PASS_V(nir, nir_lower_array_deref_of_vec,
              nir_var_mem_ubo | nir_var_mem_ssbo,
              nir_lower_direct_array_deref_of_vec_load);

   /* Get rid of split copies */
   nir_optimize(nir, stage, false);
}

static nir_shader *
shader_module_compile_to_nir(struct v3dv_device *device,
                             struct v3dv_pipeline_stage *stage)
{
   nir_shader *nir;
   const nir_shader_compiler_options *nir_options = &v3dv_nir_options;

   uint32_t *spirv = (uint32_t *) stage->module->data;
   assert(stage->module->size % 4 == 0);

   if (V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV)
      v3dv_print_spirv(stage->module->data, stage->module->size, stderr);

   uint32_t num_spec_entries = 0;
   struct nir_spirv_specialization *spec_entries = NULL;

   const struct spirv_to_nir_options spirv_options = default_spirv_options;
   nir = spirv_to_nir(spirv, stage->module->size / 4,
                      spec_entries, num_spec_entries,
                      stage->stage, stage->entrypoint,
                      &spirv_options, nir_options);
   assert(nir->info.stage == stage->stage);
   nir_validate_shader(nir, "after spirv_to_nir");

   if (V3D_DEBUG & (V3D_DEBUG_NIR |
                    v3d_debug_flag_for_shader_stage(stage->stage))) {
      fprintf(stderr, "Initial form: %s prog %d NIR:\n",
              gl_shader_stage_name(stage->stage),
              stage->program_id);
      nir_print_shader(nir, stderr);
      fprintf(stderr, "\n");
   }

   free(spec_entries);

   /* We have to lower away local variable initializers right before we
    * inline functions.  That way they get properly initialized at the top
    * of the function and not at the top of its caller.
    */
   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
   NIR_PASS_V(nir, nir_lower_returns);
   NIR_PASS_V(nir, nir_inline_functions);
   NIR_PASS_V(nir, nir_opt_deref);

   /* Pick off the single entrypoint that we want */
   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
      if (func->is_entrypoint)
         func->name = ralloc_strdup(func, "main");
      else
         exec_node_remove(&func->node);
   }
   assert(exec_list_length(&nir->functions) == 1);

   /* Vulkan uses the separate-shader linking model */
   nir->info.separate_shader = true;

   preprocess_nir(nir, stage);

   return nir;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

static unsigned
descriptor_map_add(struct v3dv_descriptor_map *map,
                   int set,
                   int binding,
                   int array_index,
                   int array_size)
{
   assert(array_index < array_size);

   unsigned index = 0;
   for (unsigned i = 0; i < map->num_desc; i++) {
      if (set == map->set[i] &&
          binding == map->binding[i] &&
          array_index == map->array_index[i]) {
         assert(array_size == map->array_size[i]);
         return index;
      }
      index++;
   }

   assert(index == map->num_desc);

   map->set[map->num_desc] = set;
   map->binding[map->num_desc] = binding;
   map->array_index[map->num_desc] = array_index;
   map->array_size[map->num_desc] = array_size;
   map->num_desc++;

   return index;
}


static void
lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
                         struct v3dv_pipeline *pipeline)
{
   assert(instr->intrinsic == nir_intrinsic_load_push_constant);

   /* FIXME: next assert it not something that should happen in general, just
    * to catch any test example under that case and deal with it
    */
   assert(nir_intrinsic_base(instr) == 0);

   instr->intrinsic = nir_intrinsic_load_uniform;
}

/* Gathers info from the intrinsic (set and binding) and then lowers it so it
 * could be used by the v3d_compiler */
static void
lower_vulkan_resource_index(nir_builder *b,
                            nir_intrinsic_instr *instr,
                            struct v3dv_pipeline *pipeline,
                            const struct v3dv_pipeline_layout *layout)
{
   assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index);

   nir_const_value *const_val = nir_src_as_const_value(instr->src[0]);

   unsigned set = nir_intrinsic_desc_set(instr);
   unsigned binding = nir_intrinsic_binding(instr);
   struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout;
   struct v3dv_descriptor_set_binding_layout *binding_layout =
      &set_layout->binding[binding];
   unsigned index = 0;

   switch (nir_intrinsic_desc_type(instr)) {
   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
      struct v3dv_descriptor_map *descriptor_map =
         nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?
         &pipeline->ubo_map : &pipeline->ssbo_map;

      if (!const_val)
         unreachable("non-constant vulkan_resource_index array index");

      index = descriptor_map_add(descriptor_map, set, binding,
                                 const_val->u32,
                                 binding_layout->array_size);

      if (nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
         /* skip index 0 which is used for push constants */
         index++;
      }
      break;
   }

   default:
      unreachable("unsupported desc_type for vulkan_resource_index");
      break;
   }

   nir_ssa_def_rewrite_uses(&instr->dest.ssa,
                            nir_src_for_ssa(nir_imm_int(b, index)));
   nir_instr_remove(&instr->instr);
}

static bool
lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
                struct v3dv_pipeline *pipeline,
                const struct v3dv_pipeline_layout *layout)
{
   switch (instr->intrinsic) {

   case nir_intrinsic_load_push_constant:
      lower_load_push_constant(b, instr, pipeline);
      pipeline->use_push_constants = true;
      return true;

   case nir_intrinsic_vulkan_resource_index:
      lower_vulkan_resource_index(b, instr, pipeline, layout);
      return true;

   default:
      return false;
   }
}

static bool
lower_impl(nir_function_impl *impl,
           struct v3dv_pipeline *pipeline,
           const struct v3dv_pipeline_layout *layout)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         b.cursor = nir_before_instr(instr);
         switch (instr->type) {
         case nir_instr_type_intrinsic:
            progress |=
               lower_intrinsic(&b, nir_instr_as_intrinsic(instr), pipeline, layout);
            break;
         default:
            break;
         }
      }
   }

   return progress;
}

static bool
lower_pipeline_layout_info(nir_shader *shader,
                           struct v3dv_pipeline *pipeline,
                           const struct v3dv_pipeline_layout *layout)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= lower_impl(function->impl, pipeline, layout);
   }

   return progress;
}


static void
lower_fs_io(nir_shader *nir)
{
   /* Our backend doesn't handle array fragment shader outputs */
   NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_out, NULL);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
                               MESA_SHADER_FRAGMENT);

   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
                               MESA_SHADER_FRAGMENT);

   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size_vec4, 0);
}

static void
lower_vs_io(struct nir_shader *nir)
{
   NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
                               MESA_SHADER_VERTEX);

   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
                               MESA_SHADER_VERTEX);

   /* FIXME: if we call nir_lower_io, we get a crash later. Likely because it
    * overlaps with v3d_nir_lower_io. Need further research though.
    */
}

static void
shader_debug_output(const char *message, void *data)
{
   /* FIXME: We probably don't want to debug anything extra here, and in fact
    * the compiler is not using this callback too much, only as an alternative
    * way to debug out the shaderdb stats, that you can already get using
    * V3D_DEBUG=shaderdb. Perhaps it would make sense to revisit the v3d
    * compiler to remove that callback.
    */
}

static void
pipeline_populate_v3d_key(struct v3d_key *key,
                          const VkGraphicsPipelineCreateInfo *pCreateInfo,
                          const struct v3dv_pipeline_stage *p_stage)
{
   /* default value. Would be override on the vs/gs populate methods when GS
    * gets supported
    */
   key->is_last_geometry_stage = true;

   /* Vulkan provides a way to define clip distances, but not clip planes, so
    * we understand that this would be always zero. Probably would need to be
    * revisited based on all the clip related extensions available.
    */
   key->ucp_enables = 0;

   key->environment = V3D_ENVIRONMENT_VULKAN;
}

/* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the
 * same. For not using prim_mode that is the one already used on v3d
 */
static const enum pipe_prim_type vk_to_pipe_prim_type[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY,
};

static const enum pipe_logicop vk_to_pipe_logicop[] = {
   [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE] = PIPE_LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_COPY] = PIPE_LOGICOP_COPY,
   [VK_LOGIC_OP_AND_INVERTED] = PIPE_LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP] = PIPE_LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR] = PIPE_LOGICOP_XOR,
   [VK_LOGIC_OP_OR] = PIPE_LOGICOP_OR,
   [VK_LOGIC_OP_NOR] = PIPE_LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT] = PIPE_LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT] = PIPE_LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE] = PIPE_LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED] = PIPE_LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED] = PIPE_LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND] = PIPE_LOGICOP_NAND,
   [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET,
};

static void
pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
                             const struct v3dv_pipeline_stage *p_stage)
{
   memset(key, 0, sizeof(*key));

   pipeline_populate_v3d_key(&key->base, pCreateInfo, p_stage);

   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
      pCreateInfo->pInputAssemblyState;
   uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];

   key->is_points = (topology == PIPE_PRIM_POINTS);
   key->is_lines = (topology >= PIPE_PRIM_LINES &&
                    topology <= PIPE_PRIM_LINE_STRIP);

   /* Vulkan doesn't appear to specify (anv does the same) */
   key->clamp_color = false;

   const VkPipelineColorBlendStateCreateInfo *cb_info =
      pCreateInfo->pColorBlendState;

   key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
                       vk_to_pipe_logicop[cb_info->logicOp] :
                       PIPE_LOGICOP_COPY;

   const VkPipelineMultisampleStateCreateInfo *ms_info =
      pCreateInfo->pMultisampleState;

   /* FIXME: msaa not supported yet (although we add some of the code to
    * translate vk sample info in advance)
    */
   key->msaa = false;
   if (key->msaa & (ms_info != NULL)) {
      uint32_t sample_mask = 0xffff;

      if (ms_info->pSampleMask)
         sample_mask = ms_info->pSampleMask[0] & 0xffff;

      key->sample_coverage = (sample_mask != (1 << V3D_MAX_SAMPLES) - 1);
      key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
      key->sample_alpha_to_one = ms_info->alphaToOneEnable;
   }

   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      pCreateInfo->pDepthStencilState;

   key->depth_enabled = (ds_info == NULL ? false : ds_info->depthTestEnable);

   /* Vulkan doesn't support alpha test */
   key->alpha_test = false;
   key->alpha_test_func = COMPARE_FUNC_NEVER;

   /* FIXME: placeholder. Final value for swap_color_rb depends on the format
    * of the surface to be used.
    */
   key->swap_color_rb = false;

   const struct v3dv_render_pass *pass =
      v3dv_render_pass_from_handle(pCreateInfo->renderPass);
   const struct v3dv_subpass *subpass = p_stage->pipeline->subpass;
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t att_idx = subpass->color_attachments[i].attachment;
      if (att_idx == VK_ATTACHMENT_UNUSED)
         continue;

      key->cbufs |= 1 << i;

      VkFormat fb_format = pass->attachments[att_idx].desc.format;
      enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);

      /* If logic operations are enabled then we might emit color reads and we
       * need to know the color buffer format and swizzle for that
       */
      if (key->logicop_func != PIPE_LOGICOP_COPY) {
         key->color_fmt[i].format = fb_pipe_format;
         key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format);
      }

      const struct util_format_description *desc =
         vk_format_description(fb_format);

      if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
          desc->channel[0].size == 32) {
         key->f32_color_rb |= 1 << i;
      }

      if (p_stage->nir->info.fs.untyped_color_outputs) {
         if (util_format_is_pure_uint(fb_pipe_format))
            key->uint_color_rb |= 1 << i;
         else if (util_format_is_pure_sint(fb_pipe_format))
            key->int_color_rb |= 1 << i;
      }

      if (key->is_points) {
         /* FIXME: The mask would need to be computed based on the shader
          * inputs. On gallium it is done at st_atom_rasterizer
          * (sprite_coord_enable). anv seems (need to confirm) to do that on
          * genX_pipeline (PointSpriteTextureCoordinateEnable). Would be also
          * better to have tests to guide filling the mask.
          */
         key->point_sprite_mask = 0;

         /* Vulkan mandates upper left. */
         key->point_coord_upper_left = true;
      }
   }

   /* FIXME: we understand that this is used on GL to configure fixed-function
    * two side lighting support, and not make sense for Vulkan. Need to
    * confirm though.
    */
   key->light_twoside = false;

   /* FIXME: ditto, although for flat lighting. Again, neet to confirm.*/
   key->shade_model_flat = false;
}

static void
pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
                             const struct v3dv_pipeline_stage *p_stage)
{
   memset(key, 0, sizeof(*key));

   pipeline_populate_v3d_key(&key->base, pCreateInfo, p_stage);

   /* Vulkan doesn't appear to specify (anv does the same) */
   key->clamp_color = false;

   /* Vulkan specifies a point size per vertex, so true for if the prim are
    * points, like on ES2)
    */
   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
      pCreateInfo->pInputAssemblyState;
   uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];

   /* FIXME: not enough to being PRIM_POINTS, on gallium the full check is
    * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */
   key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS);

   key->is_coord = p_stage->is_coord;
   if (p_stage->is_coord) {
      /* The only output varying on coord shaders are for transform
       * feedback. Set to 0 as VK_EXT_transform_feedback is not supported.
       */
      key->num_used_outputs = 0;
   } else {
      struct v3dv_pipeline *pipeline = p_stage->pipeline;
      key->num_used_outputs = pipeline->fs->prog_data.fs->num_inputs;
      STATIC_ASSERT(sizeof(key->used_outputs) ==
                    sizeof(pipeline->fs->prog_data.fs->input_slots));
      memcpy(key->used_outputs, pipeline->fs->prog_data.fs->input_slots,
             sizeof(key->used_outputs));
   }
}

/*
 * Creates the pipeline_stage for the coordinate shader. Initially a clone of
 * the vs pipeline_stage, with is_coord to true;
 */
static struct v3dv_pipeline_stage*
pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src,
                             const VkAllocationCallbacks *alloc)
{
   struct v3dv_device *device = src->pipeline->device;

   struct v3dv_pipeline_stage *p_stage =
      vk_zalloc2(&device->alloc, alloc, sizeof(*p_stage), 8,
                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   p_stage->pipeline = src->pipeline;
   assert(src->stage == MESA_SHADER_VERTEX);
   p_stage->stage = src->stage;
   p_stage->entrypoint = src->entrypoint;
   p_stage->module = src->module;
   p_stage->nir = src->nir;

   p_stage->is_coord = true;

   return p_stage;
}

/* FIXME: right now this just asks for an bo for the exact size of the qpu
 * assembly. It would be good to be slighly smarter and having one "all
 * shaders" bo per pipeline, so each p_stage would save their offset on
 * such. That is really relevant due the fact that bo are always aligned to
 * 4096, so that would allow to use less memory.
 *
 * For now one-bo per-assembly would work.
 */
static void
upload_assembly(struct v3dv_pipeline_stage *p_stage,
                const void *data,
                uint32_t size)
{
   const char *name = NULL;
   /* We are uploading the assembly just once, so at this point we shouldn't
    * have any bo
    */
   assert(p_stage->assembly_bo == NULL);
   struct v3dv_device *device = p_stage->pipeline->device;

   switch (p_stage->stage) {
   case MESA_SHADER_VERTEX:
      name = (p_stage->is_coord == true) ? "coord_shader_assembly" :
         "vertex_shader_assembly";
      break;
   case MESA_SHADER_FRAGMENT:
      name = "fragment_shader_assembly";
      break;
   default:
      unreachable("Stage not supported\n");
      break;
   };

   struct v3dv_bo *bo = v3dv_bo_alloc(device, size, name);
   if (!bo) {
      fprintf(stderr, "failed to allocate memory for shader\n");
      abort();
   }

   bool ok = v3dv_bo_map(device, bo, size);
   if (!ok) {
      fprintf(stderr, "failed to map source shader buffer\n");
      abort();
   }

   memcpy(bo->map, data, size);

   v3dv_bo_unmap(device, bo);

   p_stage->assembly_bo = bo;
}

static void
compile_pipeline_stage(struct v3dv_pipeline_stage *p_stage)
{
   struct v3dv_physical_device *physical_device =
      &p_stage->pipeline->device->instance->physicalDevice;
   const struct v3d_compiler *compiler = physical_device->compiler;

   /* We don't support variants (and probably will never support them) */
   int variant_id = 0;

   /* Note that we are assigning program_id slightly differently that
    * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin
    * would have a different program_id, while v3d would have the same for
    * both. For the case of v3dv, it is more natural to have an id this way,
    * as right now we are using it for debugging, not for shader-db.
    */
   p_stage->program_id = physical_device->next_program_id++;

   if (V3D_DEBUG & (V3D_DEBUG_NIR |
                    v3d_debug_flag_for_shader_stage(p_stage->stage))) {
      fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n",
              gl_shader_stage_name(p_stage->stage),
              p_stage->program_id);
      nir_print_shader(p_stage->nir, stderr);
      fprintf(stderr, "\n");
   }

   uint64_t *qpu_insts;
   uint32_t qpu_insts_size;

   qpu_insts = v3d_compile(compiler,
                           &p_stage->key.base, &p_stage->prog_data.base,
                           p_stage->nir,
                           shader_debug_output, NULL,
                           p_stage->program_id,
                           variant_id,
                           &qpu_insts_size);

   if (!qpu_insts) {
      fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n",
              gl_shader_stage_name(p_stage->stage),
              p_stage->program_id);
   } else {
      upload_assembly(p_stage, qpu_insts, qpu_insts_size);
   }

   free(qpu_insts);
}

/* FIXME: C&P from st, common place? */
static void
st_nir_opts(nir_shader *nir)
{
   bool progress;

   do {
      progress = false;

      NIR_PASS_V(nir, nir_lower_vars_to_ssa);

      /* Linking deals with unused inputs/outputs, but here we can remove
       * things local to the shader in the hopes that we can cleanup other
       * things. This pass will also remove variables with only stores, so we
       * might be able to make progress after it.
       */
      NIR_PASS(progress, nir, nir_remove_dead_variables,
               (nir_variable_mode)(nir_var_function_temp |
                                   nir_var_shader_temp |
                                   nir_var_mem_shared),
               NULL);

      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      if (nir->options->lower_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
         NIR_PASS_V(nir, nir_lower_phis_to_scalar);
      }

      NIR_PASS_V(nir, nir_lower_alu);
      NIR_PASS_V(nir, nir_lower_pack);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      if (nir_opt_trivial_continues(nir)) {
         progress = true;
         NIR_PASS(progress, nir, nir_copy_prop);
         NIR_PASS(progress, nir, nir_opt_dce);
      }
      NIR_PASS(progress, nir, nir_opt_if, false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
   } while (progress);
}

static void
link_shaders(nir_shader *producer, nir_shader *consumer)
{
   assert(producer);
   assert(consumer);

   if (producer->options->lower_to_scalar) {
      NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
      NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
   }

   nir_lower_io_arrays_to_elements(producer, consumer);

   st_nir_opts(producer);
   st_nir_opts(consumer);

   if (nir_link_opt_varyings(producer, consumer))
      st_nir_opts(consumer);

   NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
   NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);

   if (nir_remove_unused_varyings(producer, consumer)) {
      NIR_PASS_V(producer, nir_lower_global_vars_to_local);
      NIR_PASS_V(consumer, nir_lower_global_vars_to_local);

      st_nir_opts(producer);
      st_nir_opts(consumer);

      /* Optimizations can cause varyings to become unused.
       * nir_compact_varyings() depends on all dead varyings being removed so
       * we need to call nir_remove_dead_variables() again here.
       */
      NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
      NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
   }
}

static void
pipeline_lower_nir(struct v3dv_pipeline *pipeline,
                   struct v3dv_pipeline_stage *p_stage,
                   struct v3dv_pipeline_layout *layout)
{
   nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir));

   /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
   NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout);
}

static VkResult
pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
                          const VkGraphicsPipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *alloc)
{
   struct v3dv_pipeline_stage *stages[MESA_SHADER_STAGES] = { };
   struct v3dv_device *device = pipeline->device;

   /* First pass to get the the common info from the shader and the nir
    * shader. We don't care of the coord shader for now.
    */
   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
      gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);

      struct v3dv_pipeline_stage *p_stage =
         vk_zalloc2(&device->alloc, alloc, sizeof(*p_stage), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

      p_stage->pipeline = pipeline;
      p_stage->stage = stage;
      if (stage == MESA_SHADER_VERTEX)
         p_stage->is_coord = false;
      p_stage->entrypoint = sinfo->pName;
      p_stage->module = v3dv_shader_module_from_handle(sinfo->module);

      pipeline->active_stages |= sinfo->stage;

      /* FIXME: when cache support is in place, first check if for the given
       * spirv module and options, we already have a nir shader.
       */
      p_stage->nir = shader_module_compile_to_nir(pipeline->device, p_stage);

      stages[stage] = p_stage;
   }

   /* Add a no-op fragment shader if needed */
   if (!stages[MESA_SHADER_FRAGMENT]) {
      nir_builder b;
      nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT,
                                     &v3dv_nir_options);
      b.shader->info.name = ralloc_strdup(b.shader, "noop_fs");

      struct v3dv_pipeline_stage *p_stage =
         vk_zalloc2(&device->alloc, alloc, sizeof(*p_stage), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

      p_stage->pipeline = pipeline;
      p_stage->stage = MESA_SHADER_FRAGMENT;
      p_stage->entrypoint = "main";
      p_stage->module = 0;
      p_stage->nir = b.shader;

      stages[MESA_SHADER_FRAGMENT] = p_stage;
      pipeline->active_stages |= MESA_SHADER_FRAGMENT;
   }

   /* Linking */
   struct v3dv_pipeline_stage *next_stage = NULL;
   for (int stage = MESA_SHADER_STAGES - 1; stage >= 0; stage--) {
      if (stages[stage] == NULL || stages[stage]->entrypoint == NULL)
         continue;

      struct v3dv_pipeline_stage *p_stage = stages[stage];

      switch(stage) {
      case MESA_SHADER_VERTEX:
         link_shaders(p_stage->nir, next_stage->nir);
         break;
      case MESA_SHADER_FRAGMENT:
         /* FIXME: not doing any specific linking stuff here yet */
         break;
      default:
         unreachable("not supported shader stage");
      }

      next_stage = stages[stage];
   }

   /* Compiling to vir */
   for (int stage = MESA_SHADER_STAGES - 1; stage >= 0; stage--) {
      if (stages[stage] == NULL || stages[stage]->entrypoint == NULL)
         continue;

      struct v3dv_pipeline_stage *p_stage = stages[stage];

      pipeline_lower_nir(pipeline, p_stage, pipeline->layout);

      switch(stage) {
      case MESA_SHADER_VERTEX:
         /* Right now we only support pipelines with both vertex and fragment
          * shader.
          */
         assert(pipeline->fs);

         pipeline->vs = p_stage;

         pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, alloc);

         /* FIXME: likely this to be moved to a gather info method to a full
          * struct inside pipeline_stage
          */
         const VkPipelineInputAssemblyStateCreateInfo *ia_info =
            pCreateInfo->pInputAssemblyState;
         pipeline->vs->topology = vk_to_pipe_prim_type[ia_info->topology];

         lower_vs_io(p_stage->nir);

         /* Note that at this point we would compile twice, one for vs and
          * other for vs_bin. For now we are maintaining two pipeline_stage
          * and two keys. Eventually we could reuse the key.
          */
         pipeline_populate_v3d_vs_key(&pipeline->vs->key.vs, pCreateInfo, pipeline->vs);
         pipeline_populate_v3d_vs_key(&pipeline->vs_bin->key.vs, pCreateInfo, pipeline->vs_bin);

         compile_pipeline_stage(pipeline->vs);
         compile_pipeline_stage(pipeline->vs_bin);
         break;
      case MESA_SHADER_FRAGMENT:
         pipeline->fs = p_stage;

         pipeline_populate_v3d_fs_key(&p_stage->key.fs, pCreateInfo,
                             p_stage);

         lower_fs_io(p_stage->nir);

         compile_pipeline_stage(pipeline->fs);
         break;
      default:
         unreachable("not supported shader stage");
      }
   }

   /* FIXME: values below are default when non-GS is available. Would need to
    * provide real values if GS gets supported
    */
   pipeline->vpm_cfg_bin.As = 1;
   pipeline->vpm_cfg_bin.Ve = 0;
   pipeline->vpm_cfg_bin.Vc = pipeline->vs_bin->prog_data.vs->vcm_cache_size;

   pipeline->vpm_cfg.As = 1;
   pipeline->vpm_cfg.Ve = 0;
   pipeline->vpm_cfg.Vc = pipeline->vs->prog_data.vs->vcm_cache_size;

   return VK_SUCCESS;
}

static unsigned
v3dv_dynamic_state_mask(VkDynamicState state)
{
   switch(state) {
   case VK_DYNAMIC_STATE_VIEWPORT:
      return V3DV_DYNAMIC_VIEWPORT;
   case VK_DYNAMIC_STATE_SCISSOR:
      return V3DV_DYNAMIC_SCISSOR;
   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
      return V3DV_DYNAMIC_STENCIL_COMPARE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
      return V3DV_DYNAMIC_STENCIL_WRITE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
      return V3DV_DYNAMIC_STENCIL_REFERENCE;
   default:
      unreachable("Unhandled dynamic state");
   }
}

static void
pipeline_init_dynamic_state(struct v3dv_pipeline *pipeline,
                            const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   pipeline->dynamic_state = default_dynamic_state;
   struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;

   /* Create a mask of enabled dynamic states */
   uint32_t dynamic_states = 0;
   if (pCreateInfo->pDynamicState) {
      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
      for (uint32_t s = 0; s < count; s++) {
         dynamic_states |=
            v3dv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
      }
   }

   /* For any pipeline states that are not dynamic, set the dynamic state
    * from the static pipeline state.
    *
    * Notice that we don't let the number of viewports and scissort rects to
    * be set dynamically, so these are always copied from the pipeline state.
    */
   dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
   if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) {
      assert(pCreateInfo->pViewportState);

      typed_memcpy(dynamic->viewport.viewports,
                   pCreateInfo->pViewportState->pViewports,
                   pCreateInfo->pViewportState->viewportCount);

      for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
         v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
                                     dynamic->viewport.scale[i],
                                     dynamic->viewport.translate[i]);
      }
   }

   dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
   if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) {
      typed_memcpy(dynamic->scissor.scissors,
                   pCreateInfo->pViewportState->pScissors,
                   pCreateInfo->pViewportState->scissorCount);
   }

   if (pCreateInfo->pDepthStencilState != NULL) {

      if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
         dynamic->stencil_compare_mask.front =
            pCreateInfo->pDepthStencilState->front.compareMask;
         dynamic->stencil_compare_mask.back =
            pCreateInfo->pDepthStencilState->back.compareMask;
      }

      if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
         dynamic->stencil_write_mask.front =
            pCreateInfo->pDepthStencilState->front.writeMask;
         dynamic->stencil_write_mask.back =
            pCreateInfo->pDepthStencilState->back.writeMask;
      }

      if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
         dynamic->stencil_reference.front =
            pCreateInfo->pDepthStencilState->front.reference;
         dynamic->stencil_reference.back =
            pCreateInfo->pDepthStencilState->back.reference;
      }
   }

   pipeline->dynamic_state.mask = dynamic_states;
}

static void
pack_cfg_bits(struct v3dv_pipeline *pipeline,
              const VkPipelineDepthStencilStateCreateInfo *ds_info,
              const VkPipelineRasterizationStateCreateInfo *rs_info,
              const VkPipelineColorBlendStateCreateInfo *cb_info)
{
   assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS));

   /* CFG_BITS allow to set a overall blend_enable that it is anded with the
    * per-target blend enable. v3d so far creates a mask with each target, so
    * we just set to true if any attachment has blending enabled
    */
   bool overall_blend_enable = false;
   if (cb_info) {
      for (uint32_t i = 0; i < cb_info->attachmentCount; i++) {
         const VkPipelineColorBlendAttachmentState *b_state =
            &cb_info->pAttachments[i];

         overall_blend_enable |= b_state->blendEnable;
      }
   }

   v3dv_pack(pipeline->cfg_bits, CFG_BITS, config) {
      config.enable_forward_facing_primitive =
         rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false;

      config.enable_reverse_facing_primitive =
         rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false;

      /* Seems like the hardware is backwards regarding this setting... */
      config.clockwise_primitives =
         rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false;

      config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable: false;

      /* FIXME: oversample_mode postponed until msaa gets supported */
      config.rasterizer_oversample_mode = false;

      /* From the Vulkan spec:
       *
       *   "Provoking Vertex:
       *
       *       The vertex in a primitive from which flat shaded attribute
       *       values are taken. This is generally the “first” vertex in the
       *       primitive, and depends on the primitive topology."
       *
       * First vertex is the Direct3D style for provoking vertex. OpenGL uses
       * the last vertex by default.
       */
      config.direct3d_provoking_vertex = true;

      config.blend_enable = overall_blend_enable;

      /* Note: ez state may update based on the compiled FS, along with zsa
       * (FIXME: not done)
       */
      config.early_z_updates_enable = false;
      if (ds_info && ds_info->depthTestEnable) {
         config.z_updates_enable = true;
         config.early_z_enable = false;
         config.depth_test_function = ds_info->depthCompareOp;
      } else {
         config.depth_test_function = VK_COMPARE_OP_ALWAYS;
      }

      config.stencil_enable = ds_info ? ds_info->stencilTestEnable : false;
   };
}

static uint32_t
translate_stencil_op(enum pipe_stencil_op op)
{
   switch (op) {
   case VK_STENCIL_OP_KEEP:
      return V3D_STENCIL_OP_KEEP;
   case VK_STENCIL_OP_ZERO:
      return V3D_STENCIL_OP_ZERO;
   case VK_STENCIL_OP_REPLACE:
      return V3D_STENCIL_OP_REPLACE;
   case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
      return V3D_STENCIL_OP_INCR;
   case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
      return V3D_STENCIL_OP_DECR;
   case VK_STENCIL_OP_INVERT:
      return V3D_STENCIL_OP_INVERT;
   case VK_STENCIL_OP_INCREMENT_AND_WRAP:
      return V3D_STENCIL_OP_INCWRAP;
   case VK_STENCIL_OP_DECREMENT_AND_WRAP:
      return V3D_STENCIL_OP_DECWRAP;
   default:
      unreachable("bad stencil op");
   }
}

static void
pack_single_stencil_cfg(struct v3dv_pipeline *pipeline,
                        uint8_t *stencil_cfg,
                        bool is_front,
                        bool is_back,
                        const VkStencilOpState *stencil_state)
{
   /* From the Vulkan spec:
    *
    *   "Reference is an integer reference value that is used in the unsigned
    *    stencil comparison. The reference value used by stencil comparison
    *    must be within the range [0,2^s-1] , where s is the number of bits in
    *    the stencil framebuffer attachment, otherwise the reference value is
    *    considered undefined."
    *
    * In our case, 's' is always 8, so we clamp to that to prevent our packing
    * functions to assert in debug mode if they see larger values.
    *
    * If we have dynamic state we need to make sure we set the corresponding
    * state bits to 0, since cl_emit_with_prepacked ORs the new value with
    * the old.
    */
   const uint8_t write_mask =
      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ?
         0 : stencil_state->writeMask & 0xff;

   const uint8_t compare_mask =
      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
         0 : stencil_state->compareMask & 0xff;

   const uint8_t reference =
      pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ?
         0 : stencil_state->reference & 0xff;

   v3dv_pack(stencil_cfg, STENCIL_CFG, config) {
      config.front_config = is_front;
      config.back_config = is_back;
      config.stencil_write_mask = write_mask;
      config.stencil_test_mask = compare_mask;
      config.stencil_test_function = stencil_state->compareOp;
      config.stencil_pass_op = translate_stencil_op(stencil_state->passOp);
      config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp);
      config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp);
      config.stencil_ref_value = reference;
   }
}

static void
pack_stencil_cfg(struct v3dv_pipeline *pipeline,
                 const VkPipelineDepthStencilStateCreateInfo *ds_info)
{
   assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG));

   if (!ds_info || !ds_info->stencilTestEnable)
      return;

   const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
                                           V3DV_DYNAMIC_STENCIL_WRITE_MASK |
                                           V3DV_DYNAMIC_STENCIL_REFERENCE;


   /* If front != back or we have dynamic stencil state we can't emit a single
    * packet for both faces.
    */
   bool needs_front_and_back = false;
   if ((pipeline->dynamic_state.mask & dynamic_stencil_states) ||
       memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front)))
      needs_front_and_back = true;

   /* If the front and back configurations are the same we can emit both with
    * a single packet.
    */
   pipeline->emit_stencil_cfg[0] = true;
   if (!needs_front_and_back) {
      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
                              true, true, &ds_info->front);
   } else {
      pipeline->emit_stencil_cfg[1] = true;
      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0],
                              true, false, &ds_info->front);
      pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1],
                              false, true, &ds_info->back);
   }
}

static bool
stencil_op_is_no_op(const VkStencilOpState *stencil)
{
   return stencil->depthFailOp == VK_STENCIL_OP_KEEP &&
          stencil->compareOp == VK_COMPARE_OP_ALWAYS;
}

static void
pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
                      const VkPipelineDepthStencilStateCreateInfo *ds_info)
{
   if (!ds_info || !ds_info->depthTestEnable) {
      pipeline->ez_state = VC5_EZ_DISABLED;
      return;
   }

   switch (ds_info->depthCompareOp) {
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      pipeline->ez_state = VC5_EZ_LT_LE;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      pipeline->ez_state = VC5_EZ_GT_GE;
      break;
   case VK_COMPARE_OP_NEVER:
   case VK_COMPARE_OP_EQUAL:
      pipeline->ez_state = VC5_EZ_UNDECIDED;
      break;
   default:
      pipeline->ez_state = VC5_EZ_DISABLED;
      break;
   }

   /* If stencil is enabled and is not a no-op, we need to disable EZ */
   if (ds_info->stencilTestEnable &&
       (!stencil_op_is_no_op(&ds_info->front) ||
        !stencil_op_is_no_op(&ds_info->back))) {
         pipeline->ez_state = VC5_EZ_DISABLED;
   }
}

static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
   assert(sizeof(pipeline->shader_state_record) ==
          cl_packet_length(GL_SHADER_STATE_RECORD));

   /* Note: we are not packing addresses, as we need the job (see
    * cl_pack_emit_reloc). Additionally uniforms can't be filled up at this
    * point as they depend on dynamic info that can be set after create the
    * pipeline (like viewport), . Would need to be filled later, so we are
    * doing a partial prepacking.
    */
   v3dv_pack(pipeline->shader_state_record, GL_SHADER_STATE_RECORD, shader) {
      shader.enable_clipping = true;

      shader.point_size_in_shaded_vertex_data =
         pipeline->vs->key.vs.per_vertex_point_size;

      /* Must be set if the shader modifies Z, discards, or modifies
       * the sample mask.  For any of these cases, the fragment
       * shader needs to write the Z value (even just discards).
       */
      shader.fragment_shader_does_z_writes =
         pipeline->fs->prog_data.fs->writes_z;
      /* Set if the EZ test must be disabled (due to shader side
       * effects and the early_z flag not being present in the
       * shader).
       */
      shader.turn_off_early_z_test =
         pipeline->fs->prog_data.fs->disable_ez;

      shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
         pipeline->fs->prog_data.fs->uses_center_w;

      shader.any_shader_reads_hardware_written_primitive_id = false;

      shader.do_scoreboard_wait_on_first_thread_switch =
         pipeline->fs->prog_data.fs->lock_scoreboard_on_first_thrsw;
      shader.disable_implicit_point_line_varyings =
         !pipeline->fs->prog_data.fs->uses_implicit_point_line_varyings;

      shader.number_of_varyings_in_fragment_shader =
         pipeline->fs->prog_data.fs->num_inputs;

      shader.coordinate_shader_propagate_nans = true;
      shader.vertex_shader_propagate_nans = true;
      shader.fragment_shader_propagate_nans = true;

      /* Note: see previous note about adresses */
      /* shader.coordinate_shader_code_address */
      /* shader.vertex_shader_code_address */
      /* shader.fragment_shader_code_address */

      /* FIXME: Use combined input/output size flag in the common case (also
       * on v3d, see v3dx_draw).
       */
      shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
         pipeline->vs_bin->prog_data.vs->separate_segments;
      shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
         pipeline->vs->prog_data.vs->separate_segments;

      shader.coordinate_shader_input_vpm_segment_size =
         pipeline->vs_bin->prog_data.vs->separate_segments ?
         pipeline->vs_bin->prog_data.vs->vpm_input_size : 1;
      shader.vertex_shader_input_vpm_segment_size =
         pipeline->vs->prog_data.vs->separate_segments ?
         pipeline->vs->prog_data.vs->vpm_input_size : 1;

      shader.coordinate_shader_output_vpm_segment_size =
         pipeline->vs_bin->prog_data.vs->vpm_output_size;
      shader.vertex_shader_output_vpm_segment_size =
         pipeline->vs->prog_data.vs->vpm_output_size;

      /* Note: see previous note about adresses */
      /* shader.coordinate_shader_uniforms_address */
      /* shader.vertex_shader_uniforms_address */
      /* shader.fragment_shader_uniforms_address */

      shader.min_coord_shader_input_segments_required_in_play =
         pipeline->vpm_cfg_bin.As;
      shader.min_vertex_shader_input_segments_required_in_play =
         pipeline->vpm_cfg.As;

      shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
         pipeline->vpm_cfg_bin.Ve;
      shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
         pipeline->vpm_cfg.Ve;

      shader.coordinate_shader_4_way_threadable =
         pipeline->vs_bin->prog_data.vs->base.threads == 4;
      shader.vertex_shader_4_way_threadable =
         pipeline->vs->prog_data.vs->base.threads == 4;
      shader.fragment_shader_4_way_threadable =
         pipeline->fs->prog_data.fs->base.threads == 4;

      shader.coordinate_shader_start_in_final_thread_section =
         pipeline->vs_bin->prog_data.vs->base.single_seg;
      shader.vertex_shader_start_in_final_thread_section =
         pipeline->vs->prog_data.vs->base.single_seg;
      shader.fragment_shader_start_in_final_thread_section =
         pipeline->fs->prog_data.fs->base.single_seg;

      shader.vertex_id_read_by_coordinate_shader =
         pipeline->vs_bin->prog_data.vs->uses_vid;
      shader.instance_id_read_by_coordinate_shader =
         pipeline->vs_bin->prog_data.vs->uses_iid;
      shader.vertex_id_read_by_vertex_shader =
         pipeline->vs->prog_data.vs->uses_vid;
      shader.instance_id_read_by_vertex_shader =
         pipeline->vs->prog_data.vs->uses_iid;

      /* Note: see previous note about adresses */
      /* shader.address_of_default_attribute_values */
   }
}

static void
pack_vcm_cache_size(struct v3dv_pipeline *pipeline)
{
   assert(sizeof(pipeline->vcm_cache_size) ==
          cl_packet_length(VCM_CACHE_SIZE));

   v3dv_pack(pipeline->vcm_cache_size, VCM_CACHE_SIZE, vcm) {
      vcm.number_of_16_vertex_batches_for_binning = pipeline->vpm_cfg_bin.Vc;
      vcm.number_of_16_vertex_batches_for_rendering = pipeline->vpm_cfg.Vc;
   }
}

/* As defined on the GL_SHADER_STATE_ATTRIBUTE_RECORD */
static uint8_t
get_attr_type(const struct util_format_description *desc)
{
   uint32_t r_size = desc->channel[0].size;
   uint8_t attr_type = ATTRIBUTE_FLOAT;

   switch (desc->channel[0].type) {
   case UTIL_FORMAT_TYPE_FLOAT:
      if (r_size == 32) {
         attr_type = ATTRIBUTE_FLOAT;
      } else {
         assert(r_size == 16);
         attr_type = ATTRIBUTE_HALF_FLOAT;
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
   case UTIL_FORMAT_TYPE_UNSIGNED:
      switch (r_size) {
      case 32:
         attr_type = ATTRIBUTE_INT;
         break;
      case 16:
         attr_type = ATTRIBUTE_SHORT;
         break;
      case 10:
         attr_type = ATTRIBUTE_INT2_10_10_10;
         break;
      case 8:
         attr_type = ATTRIBUTE_BYTE;
         break;
      default:
         fprintf(stderr,
                 "format %s unsupported\n",
                 desc->name);
         attr_type = ATTRIBUTE_BYTE;
         abort();
      }
      break;

   default:
      fprintf(stderr,
              "format %s unsupported\n",
              desc->name);
      abort();
   }

   return attr_type;
}

static void
create_default_attribute_values(struct v3dv_pipeline *pipeline,
                                const VkPipelineVertexInputStateCreateInfo *vi_info)
{
   uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;

   if (pipeline->default_attribute_values == NULL) {
      pipeline->default_attribute_values = v3dv_bo_alloc(pipeline->device, size,
                                                         "default_vi_attributes");

      if (!pipeline->default_attribute_values) {
         fprintf(stderr, "failed to allocate memory for the default "
                 "attribute values\n");
      }
   }

   bool ok = v3dv_bo_map(pipeline->device,
                         pipeline->default_attribute_values, size);
   if (!ok) {
      fprintf(stderr, "failed to map default attribute values buffer\n");
      abort();
   }

   uint32_t *attrs = pipeline->default_attribute_values->map;

   for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
      attrs[i * 4 + 0] = 0;
      attrs[i * 4 + 1] = 0;
      attrs[i * 4 + 2] = 0;
      if (i < pipeline->va_count && vk_format_is_int(pipeline->va[i].vk_format)) {
         attrs[i * 4 + 3] = 1;
      } else {
         attrs[i * 4 + 3] = fui(1.0);
      }
   }

   v3dv_bo_unmap(pipeline->device, pipeline->default_attribute_values);
}

static void
pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline,
                                   uint32_t index,
                                   const VkVertexInputAttributeDescription *vi_desc)
{
   const uint32_t packet_length =
      cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);

   const struct util_format_description *desc =
      vk_format_description(vi_desc->format);

   uint32_t binding = vi_desc->binding;

   v3dv_pack(&pipeline->vertex_attrs[index * packet_length],
             GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {

      /* vec_size == 0 means 4 */
      attr.vec_size = desc->nr_channels & 3;
      attr.signed_int_type = (desc->channel[0].type ==
                              UTIL_FORMAT_TYPE_SIGNED);
      attr.normalized_int_type = desc->channel[0].normalized;
      attr.read_as_int_uint = desc->channel[0].pure_integer;

      attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor,
                                   0xffff);
      attr.stride = pipeline->vb[binding].stride;
      attr.type = get_attr_type(desc);
   }
}

static VkResult
pipeline_init(struct v3dv_pipeline *pipeline,
              struct v3dv_device *device,
              const VkGraphicsPipelineCreateInfo *pCreateInfo,
              const VkAllocationCallbacks *alloc)
{
   VkResult result = VK_SUCCESS;

   pipeline->device = device;

   V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout);
   pipeline->layout = layout;

   V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass);
   assert(pCreateInfo->subpass < render_pass->subpass_count);
   pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];

   pipeline_init_dynamic_state(pipeline, pCreateInfo);

   /* If rasterization is not enabled, various CreateInfo structs must be
    * ignored.
    */
   const bool raster_enabled =
      !pCreateInfo->pRasterizationState->rasterizerDiscardEnable;

   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      raster_enabled ? pCreateInfo->pDepthStencilState : NULL;

   const VkPipelineRasterizationStateCreateInfo *rs_info =
      raster_enabled ? pCreateInfo->pRasterizationState : NULL;

   const VkPipelineColorBlendStateCreateInfo *cb_info =
      raster_enabled ? pCreateInfo->pColorBlendState : NULL;

   pack_cfg_bits(pipeline, ds_info, rs_info, cb_info);
   pack_stencil_cfg(pipeline, ds_info);
   pipeline_set_ez_state(pipeline, ds_info);

   pipeline->primitive_restart =
      pCreateInfo->pInputAssemblyState->primitiveRestartEnable;

   result = pipeline_compile_graphics(pipeline, pCreateInfo, alloc);

   if (result != VK_SUCCESS) {
      /* Caller would already destroy the pipeline, and we didn't allocate any
       * extra info. We don't need to do anything else.
       */
      return result;
   }

   pack_shader_state_record(pipeline);
   pack_vcm_cache_size(pipeline);

   const VkPipelineVertexInputStateCreateInfo *vi_info =
      pCreateInfo->pVertexInputState;

   pipeline->vb_count = vi_info->vertexBindingDescriptionCount;
   for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
      const VkVertexInputBindingDescription *desc =
         &vi_info->pVertexBindingDescriptions[i];

      pipeline->vb[desc->binding].stride = desc->stride;
      pipeline->vb[desc->binding].instance_divisor = desc->inputRate;
   }

   pipeline->va_count = 0;
   nir_shader *shader = pipeline->vs->nir;

   for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *desc =
         &vi_info->pVertexAttributeDescriptions[i];
      uint32_t location = desc->location + VERT_ATTRIB_GENERIC0;

      nir_variable *var = nir_find_variable_with_location(shader, nir_var_shader_in, location);

      if (var != NULL) {
         unsigned driver_location = var->data.driver_location;

         pipeline->va[pipeline->va_count].offset = desc->offset;
         pipeline->va[pipeline->va_count].binding = desc->binding;
         pipeline->va[pipeline->va_count].driver_location = driver_location;
         pipeline->va[pipeline->va_count].vk_format = desc->format;

         pack_shader_state_attribute_record(pipeline, pipeline->va_count, desc);

         pipeline->va_count++;
      }
   }
   create_default_attribute_values(pipeline, vi_info);

   return result;
}

static VkResult
graphics_pipeline_create(VkDevice _device,
                         VkPipelineCache _cache,
                         const VkGraphicsPipelineCreateInfo *pCreateInfo,
                         const VkAllocationCallbacks *pAllocator,
                         VkPipeline *pPipeline)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   struct v3dv_pipeline *pipeline;
   VkResult result;

   pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = pipeline_init(pipeline, device,
                          pCreateInfo,
                          pAllocator);

   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }

   *pPipeline = v3dv_pipeline_to_handle(pipeline);

   return VK_SUCCESS;
}

VkResult
v3dv_CreateGraphicsPipelines(VkDevice _device,
                             VkPipelineCache pipelineCache,
                             uint32_t count,
                             const VkGraphicsPipelineCreateInfo *pCreateInfos,
                             const VkAllocationCallbacks *pAllocator,
                             VkPipeline *pPipelines)
{
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < count; i++) {
      VkResult local_result;

      local_result = graphics_pipeline_create(_device,
                                              pipelineCache,
                                              &pCreateInfos[i],
                                              pAllocator,
                                              &pPipelines[i]);

      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}