Merge branch 'panvk/pilot' into 'main'

Draft: Very draft! Panvk pilot shaders

See merge request mesa/mesa!39009

Commit 600fdfbc09, 18 changed files with 697 additions and 166 deletions
@@ -6623,6 +6623,8 @@ bool nir_opt_load_skip_helpers(nir_shader *shader, nir_opt_load_skip_helpers_opt
 
 void nir_sweep(nir_shader *shader);
 
+void nir_steal_function(nir_shader *nir, nir_function *f);
+
 nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val);
 gl_system_value nir_system_value_from_intrinsic(nir_intrinsic_op intrin);
 
@@ -167,7 +167,6 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
    case nir_intrinsic_load_fb_layers_v3d:
    case nir_intrinsic_load_fep_w_v3d:
    case nir_intrinsic_load_tcs_num_patches_amd:
-   case nir_intrinsic_load_sample_positions_pan:
    case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
    case nir_intrinsic_load_prim_gen_query_enabled_amd:
    case nir_intrinsic_load_prim_xfb_query_enabled_amd:

@@ -233,6 +232,8 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
    case nir_intrinsic_load_constant_agx:
    case nir_intrinsic_bindless_image_agx:
    case nir_intrinsic_bindless_sampler_agx:
+   case nir_intrinsic_load_texel_buf_conv_pan:
+   case nir_intrinsic_load_texel_buf_index_address_pan:
       return can_move_srcs(&instr->instr, ctx);
 
    /* Image/SSBO loads can be moved if they are CAN_REORDER and their

@@ -240,6 +241,7 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
     */
    case nir_intrinsic_image_load:
    case nir_intrinsic_image_samples_identical:
+   case nir_intrinsic_image_texel_address:
    case nir_intrinsic_bindless_image_load:
    case nir_intrinsic_load_global_bounded:
    case nir_intrinsic_load_ssbo:

@@ -813,7 +815,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
 
    if (num_candidates == 0) {
       free(ctx.states);
-      return false;
+      return nir_no_progress(impl);
    }
 
    def_state **candidates = malloc(sizeof(*candidates) * num_candidates);

@@ -880,7 +882,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
    if (num_candidates == 0) {
       free(ctx.states);
       free(candidates);
-      return false;
+      return nir_no_progress(impl);
    }
 
    /* Step 4: Figure out which candidates we're going to replace and assign an

@@ -940,6 +942,12 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
    if (analyze_speculation_for_cf_list(&ctx, &impl->body))
       analyze_reconstructed(&ctx, impl);
 
+   if (!BITSET_TEST_RANGE(ctx.reconstructed_defs, 0, impl->ssa_alloc)) {
+      free(ctx.states);
+      free(ctx.reconstructed_defs);
+      return nir_no_progress(impl);
+   }
+
    /* Step 5: Actually do the replacement. */
    struct hash_table *remap_table =
       _mesa_pointer_hash_table_create(NULL);

@@ -980,11 +988,10 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
       }
    }
 
-   nir_progress(true, impl, nir_metadata_control_flow);
-
    ralloc_free(remap_table);
    free(ctx.states);
    free(ctx.reconstructed_defs);
    _mesa_set_destroy(ctx.reconstructed_ifs, NULL);
-   return true;
+
+   return nir_progress(true, impl, nir_metadata_control_flow);
 }
@@ -185,6 +185,29 @@ sweep_function(nir_shader *nir, nir_function *f)
    sweep_impl(nir, f->impl);
 }
 
+/** Steals a function from one shader into another.
+ *
+ * This knows nothing about dependencies so if this function calls other
+ * functions, you'll need a more complex pass that uses nir_function_clone()
+ * to steal them all.
+ */
+void
+nir_steal_function(nir_shader *nir, nir_function *f)
+{
+#ifndef NDEBUG
+   if (f->impl) {
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr(instr, block) {
+            assert(instr->type != nir_instr_type_call);
+         }
+      }
+   }
+#endif
+   exec_node_remove(&f->node);
+   exec_list_push_tail(&nir->functions, &f->node);
+   sweep_function(nir, f);
+}
+
 void
 nir_sweep(nir_shader *nir)
 {
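As a quick illustration of the new helper, here is a minimal sketch of the intended calling pattern, mirroring the Bifrost hunk further down; make_pilot_shader and its shape are hypothetical, and only the nir_* calls that appear in this series are assumed.

/* Hypothetical sketch: split a preamble function out into its own
 * MESA_SHADER_COMPUTE shader.  nir_steal_function() asserts the function
 * contains no calls, so any dependencies must already be resolved. */
static nir_shader *
make_pilot_shader(nir_shader *producer, nir_function *preamble)
{
   nir_shader *pilot =
      nir_shader_create(producer, MESA_SHADER_COMPUTE, producer->options);

   pilot->info = producer->info;
   pilot->info.stage = MESA_SHADER_COMPUTE;

   /* The function now lives on pilot->functions and is swept into pilot's
    * memory context. */
   nir_steal_function(pilot, preamble);
   preamble->is_preamble = false;
   preamble->is_entrypoint = true;

   return pilot;
}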
@@ -6504,7 +6504,7 @@ bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id)
 }
 
 static int
-compare_u32(const void* a, const void* b, void* _)
+compare_u32(const void* a, const void* b)
 {
    const uint32_t va = (uintptr_t)a;
    const uint32_t vb = (uintptr_t)b;

@@ -6673,9 +6673,9 @@ bi_compile_variant_nir(nir_shader *nir,
       sorted[idx++] = (uintptr_t)entry.data;
    }
 
-   util_qsort_r(sorted, const_amount, sizeof(uint32_t), compare_u32, NULL);
+   qsort(sorted, const_amount, sizeof(uint32_t), compare_u32);
    uint32_t max_amount = MIN2(const_amount, ctx->inputs->fau_consts.max_amount);
    uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0;
    ralloc_free(sorted);
 
    bi_foreach_instr_global_safe(ctx, I) {

@@ -6837,7 +6837,7 @@ bi_compile_variant(nir_shader *nir,
 
    /* Software invariant: Only a secondary shader can appear at a nonzero
     * offset, to keep the ABI simple. */
-   assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
+   assert((offset == 0) ^ (idvs == BI_IDVS_VARYING || idvs == BI_IDVS_PILOT));
 
    struct pan_stats *stats =
       idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats;

@@ -6874,6 +6874,11 @@ bi_compile_variant(nir_shader *nir,
       info->vs.secondary_offset = offset;
       info->vs.secondary_preload = preload;
       info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
+   } else if (idvs == BI_IDVS_PILOT) {
+      info->pilot.enable = (binary->size > offset);
+      info->pilot.offset = offset;
+      info->pilot.preload = preload;
+      info->pilot.work_reg_count = ctx->info.work_reg_count;
    } else {
       info->preload = preload;
       info->work_reg_count = ctx->info.work_reg_count;

@@ -6996,6 +7001,26 @@ bifrost_compile_shader_nir(nir_shader *nir,
 
    info->tls_size = nir->scratch_size;
 
+   nir_shader *pilot = NULL;
+   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+   if (entrypoint->preamble) {
+      pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options);
+
+      pilot->info = nir->info;
+      pilot->info.stage = MESA_SHADER_COMPUTE;
+      pilot->info.workgroup_size[0] = 1;
+      pilot->info.workgroup_size[1] = 1;
+      pilot->info.workgroup_size[2] = 1;
+      memset(&pilot->info.cs, 0, sizeof(pilot->info.cs));
+
+      nir_function *pilot_main = entrypoint->preamble;
+      nir_steal_function(pilot, pilot_main);
+      entrypoint->preamble = NULL;
+
+      pilot_main->is_preamble = false;
+      pilot_main->is_entrypoint = true;
+   }
+
    pan_nir_collect_varyings(nir, info, PAN_MEDIUMP_VARY_32BIT);
 
    if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) {

@@ -7010,6 +7035,9 @@ bifrost_compile_shader_nir(nir_shader *nir,
       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
    }
 
+   if (pilot)
+      bi_compile_variant(pilot, inputs, binary, info, BI_IDVS_PILOT);
+
    if (mesa_shader_stage_is_compute(nir->info.stage)) {
       /* Workgroups may be merged if the structure of the workgroup is
        * not software visible. This is true if neither shared memory
@@ -1055,6 +1055,9 @@ enum bi_idvs_mode {
 
    /* IDVS2 in use. Compiling a deferred shader (v12+) */
    BI_IDVS_ALL = 3,
+
+   /* This is a pilot shader, and not a position or vertex shader */
+   BI_IDVS_PILOT = 4,
 };
 
 #define BI_MAX_REGS 64
@@ -16,6 +16,7 @@ libpanfrost_compiler_files = files(
   'pan_nir_lower_vertex_id.c',
   'pan_nir_lower_writeout.c',
   'pan_nir_lower_xfb.c',
+  'pan_nir_opt_pilot.c',
 )
 
 subdir('bifrost')
@@ -315,6 +315,23 @@ struct pan_shader_info {
       } cs;
    };
 
+   struct {
+      /* True if a pilot shader is used */
+      bool enable;
+      /* If the pilot shader is enabled, number of work registers used by
+       * the pilot shader
+       */
+      uint16_t work_reg_count;
+      /* If the pilot shader is enabled, the pilot shader's offset in the
+       * program binary.
+       */
+      uint32_t offset;
+      /* If the pilot shader is enabled, bit mask of preloaded registers
+       * used by the varying shader
+       */
+      uint64_t preload;
+   } pilot;
+
    /* Does the shader contains a barrier? or (for fragment shaders) does it
    * require helper invocations, which demand the same ordering guarantees
    * of the hardware? These notions are unified in the hardware, so we
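For orientation, a small sketch of how a driver is expected to consume the new pilot fields once bi_compile_variant() has filled them in; binary_dev_addr stands in for wherever the driver uploaded the shader blob and is the only assumption here.

/* Sketch: resolve the pilot variant's entry point inside the uploaded blob. */
static uint64_t
pilot_entry_point(const struct pan_shader_info *info, uint64_t binary_dev_addr)
{
   if (!info->pilot.enable)
      return 0;

   /* The pilot variant is appended at a nonzero offset in the same binary. */
   return binary_dev_addr + info->pilot.offset;
}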
@@ -60,6 +60,26 @@ bool pan_nir_lower_framebuffer(nir_shader *shader,
                                unsigned blend_shader_nr_samples,
                                bool broken_ld_special);
 
+struct pan_nir_opt_pilot_params {
+   unsigned gpu_id;
+
+   /* Offset in the FAU where the address to the FAU memory lives. This is
+    * used for writing FAUs from the pilot shader.
+    */
+   uint32_t fau_addr_offset_B;
+
+   /* Offset in the FAU where the pilot data should go */
+   uint32_t fau_pilot_data_offset_B;
+
+   /* Amount of space in the FAU available for pilot data */
+   uint32_t fau_pilot_data_space_B;
+
+   /* Amount of actual pilot data consumed. This is an output parameter */
+   uint32_t pilot_data_size_B;
+};
+
+bool pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p);
+
 uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s);
 
 /* Specify the mediump lowering behavior for pan_nir_collect_varyings */
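A condensed sketch of how a driver drives the new pass (the real panvk call site appears later in this series). The 8-byte FAU word size and the 64-word FAU file mirror panvk's constants; run_pilot_opt itself and used_fau_words are illustrative.

#define FAU_WORD_SIZE  8   /* bytes per FAU word (panvk's value) */
#define FAU_WORD_COUNT 64  /* FAU words available (panvk's value) */

static void
run_pilot_opt(nir_shader *nir, unsigned gpu_id, uint32_t used_fau_words,
              uint32_t fau_addr_offset_B, uint32_t *pilot_fau_words)
{
   struct pan_nir_opt_pilot_params p = {
      .gpu_id = gpu_id,
      /* Where the pilot shader finds the address of the FAU buffer. */
      .fau_addr_offset_B = fau_addr_offset_B,
      /* Pilot data starts right after the FAU words already in use... */
      .fau_pilot_data_offset_B = FAU_WORD_SIZE * used_fau_words,
      /* ...and may consume whatever is left. */
      .fau_pilot_data_space_B =
         FAU_WORD_SIZE * (FAU_WORD_COUNT - used_fau_words),
   };

   pan_nir_opt_pilot(nir, &p);

   /* Output parameter: FAU words the pilot shader will fill at runtime. */
   *pilot_fau_words =
      (p.pilot_data_size_B + FAU_WORD_SIZE - 1) / FAU_WORD_SIZE;
}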
@@ -93,144 +93,151 @@ kill_depth_stencil_writes(nir_builder *b, nir_intrinsic_instr *intr,
    return true;
 }
 
+static bool
+lower_zs_store_impl(nir_function_impl *impl)
+{
+   nir_intrinsic_instr *stores[3] = {NULL};
+   nir_intrinsic_instr *last_mask_store = NULL;
+   nir_block *mask_block = NULL;
+   unsigned writeout = 0;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+         if (sem.location == FRAG_RESULT_DEPTH) {
+            stores[0] = intr;
+            writeout |= PAN_WRITEOUT_Z;
+         } else if (sem.location == FRAG_RESULT_STENCIL) {
+            stores[1] = intr;
+            writeout |= PAN_WRITEOUT_S;
+         } else if (sem.dual_source_blend_index) {
+            assert(!stores[2]); /* there should be only 1 source for dual blending */
+            stores[2] = intr;
+            writeout |= PAN_WRITEOUT_2;
+         } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) {
+            last_mask_store = intr;
+            mask_block = intr->instr.block;
+         }
+      }
+   }
+
+   if (!writeout && !last_mask_store)
+      return nir_no_progress(impl);
+
+   nir_block *common_block = mask_block;
+
+   /* Ensure all stores are in the same block */
+   for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) {
+      if (!stores[i])
+         continue;
+
+      nir_block *block = stores[i]->instr.block;
+
+      if (common_block)
+         assert(common_block == block);
+      else
+         common_block = block;
+   }
+
+   /* move data stores in the common block to after the last mask store */
+   if (last_mask_store) {
+      nir_cursor insert_point = nir_after_instr(&last_mask_store->instr);
+      nir_foreach_instr_safe(instr, mask_block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+         /* stop when we've reached the last store to mask */
+         if (intr == last_mask_store)
+            break;
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+         if (sem.location >= FRAG_RESULT_DATA0 &&
+             sem.location <= FRAG_RESULT_DATA7) {
+            nir_instr_move(insert_point, instr);
+            insert_point = nir_after_instr(instr);
+         }
+      }
+   }
+
+   bool replaced = false;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         if (sem.location < FRAG_RESULT_DATA0)
+            continue;
+
+         if (sem.dual_source_blend_index)
+            continue;
+
+         assert(nir_src_is_const(intr->src[1]) && "no indirect outputs");
+
+         nir_builder b =
+            nir_builder_at(nir_after_block_before_jump(instr->block));
+
+         /* Trying to write depth twice results in the
+          * wrong blend shader being executed on
+          * Midgard */
+         unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout);
+
+         pan_nir_emit_combined_store(&b, intr, this_store, stores);
+
+         nir_instr_remove(instr);
+
+         replaced = true;
+      }
+   }
+
+   /* Insert a store to the depth RT (0xff) if needed */
+   if (!replaced) {
+      nir_builder b =
+         nir_builder_at(nir_after_block_before_jump(common_block));
+
+      pan_nir_emit_combined_store(&b, NULL, writeout, stores);
+   }
+
+   for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) {
+      if (stores[i])
+         nir_instr_remove(&stores[i]->instr);
+   }
+
+   return nir_progress(true, impl, nir_metadata_control_flow);
+}
+
 bool
 pan_nir_lower_zs_store(nir_shader *nir)
 {
    bool progress = false;
 
-   if (nir->info.stage != MESA_SHADER_FRAGMENT)
+   if (nir->info.stage != MESA_SHADER_FRAGMENT) {
+      nir_shader_preserve_all_metadata(nir);
       return false;
+   }
 
    /* Remove all stencil/depth writes if early fragment test is forced. */
    if (nir->info.fs.early_fragment_tests)
       progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes,
                                              nir_metadata_control_flow, NULL);
 
-   nir_foreach_function_impl(impl, nir) {
-      nir_intrinsic_instr *stores[3] = {NULL};
-      nir_intrinsic_instr *last_mask_store = NULL;
-      nir_block *mask_block = NULL;
-      unsigned writeout = 0;
-
-      nir_foreach_block(block, impl) {
-         nir_foreach_instr_safe(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-            if (intr->intrinsic != nir_intrinsic_store_output)
-               continue;
-
-            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-            if (sem.location == FRAG_RESULT_DEPTH) {
-               stores[0] = intr;
-               writeout |= PAN_WRITEOUT_Z;
-            } else if (sem.location == FRAG_RESULT_STENCIL) {
-               stores[1] = intr;
-               writeout |= PAN_WRITEOUT_S;
-            } else if (sem.dual_source_blend_index) {
-               assert(!stores[2]); /* there should be only 1 source for dual blending */
-               stores[2] = intr;
-               writeout |= PAN_WRITEOUT_2;
-            } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) {
-               last_mask_store = intr;
-               mask_block = intr->instr.block;
-            }
-         }
-      }
-
-      if (!writeout && !last_mask_store)
-         continue;
-
-      nir_block *common_block = mask_block;
-
-      /* Ensure all stores are in the same block */
-      for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) {
-         if (!stores[i])
-            continue;
-
-         nir_block *block = stores[i]->instr.block;
-
-         if (common_block)
-            assert(common_block == block);
-         else
-            common_block = block;
-      }
-
-      /* move data stores in the common block to after the last mask store */
-      if (last_mask_store) {
-         nir_cursor insert_point = nir_after_instr(&last_mask_store->instr);
-         nir_foreach_instr_safe(instr, mask_block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-            /* stop when we've reached the last store to mask */
-            if (intr == last_mask_store)
-               break;
-            if (intr->intrinsic != nir_intrinsic_store_output)
-               continue;
-            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-            if (sem.location >= FRAG_RESULT_DATA0 &&
-                sem.location <= FRAG_RESULT_DATA7) {
-               nir_instr_move(insert_point, instr);
-               insert_point = nir_after_instr(instr);
-            }
-         }
-      }
-
-      bool replaced = false;
-
-      nir_foreach_block(block, impl) {
-         nir_foreach_instr_safe(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-            if (intr->intrinsic != nir_intrinsic_store_output)
-               continue;
-
-            nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-
-            if (sem.location < FRAG_RESULT_DATA0)
-               continue;
-
-            if (sem.dual_source_blend_index)
-               continue;
-
-            assert(nir_src_is_const(intr->src[1]) && "no indirect outputs");
-
-            nir_builder b =
-               nir_builder_at(nir_after_block_before_jump(instr->block));
-
-            /* Trying to write depth twice results in the
-             * wrong blend shader being executed on
-             * Midgard */
-            unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout);
-
-            pan_nir_emit_combined_store(&b, intr, this_store, stores);
-
-            nir_instr_remove(instr);
-
-            replaced = true;
-         }
-      }
-
-      /* Insert a store to the depth RT (0xff) if needed */
-      if (!replaced) {
-         nir_builder b =
-            nir_builder_at(nir_after_block_before_jump(common_block));
-
-         pan_nir_emit_combined_store(&b, NULL, writeout, stores);
-      }
-
-      for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) {
-         if (stores[i])
-            nir_instr_remove(&stores[i]->instr);
-      }
-
-      progress = nir_progress(true, impl, nir_metadata_control_flow);
-   }
+   nir_foreach_function_impl(impl, nir)
+      progress |= lower_zs_store_impl(impl);
 
    return progress;
 }
src/panfrost/compiler/pan_nir_opt_pilot.c (new file, 194 lines)

@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2025 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "panfrost/model/pan_model.h"
+#include "pan_nir.h"
+
+static void
+pilot_def_size(nir_def *def, unsigned *size, unsigned *alignment,
+               nir_preamble_class *class)
+{
+   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   *size = align(bits / 8, 4);
+   *alignment = 4;
+   *class = nir_preamble_class_general;
+}
+
+static float
+pilot_instr_cost(nir_instr *instr, const void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      if (alu->def.bit_size == 64 || alu->src[0].src.ssa->bit_size == 64)
+         return 10.0;
+
+      /* Shrug */
+      return 2.0;
+   }
+
+   case nir_instr_type_tex:
+      return 20.0;
+
+   case nir_instr_type_intrinsic:
+      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_global:
+      case nir_intrinsic_load_global_constant:
+      case nir_intrinsic_load_ssbo:
+         return 20.0;
+      default:
+         /* Assume it's a sysval or something */
+         return 0.0;
+      }
+
+   case nir_instr_type_load_const:
+   case nir_instr_type_undef:
+      return 0.0;
+
+   default:
+      return 1.0;
+   }
+}
+
+static float
+pilot_rewrite_cost(nir_def *def, const void *data)
+{
+   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   unsigned dwords = DIV_ROUND_UP(bits, 32);
+
+   bool needs_mov = false;
+   nir_foreach_use(use, def) {
+      nir_instr *parent_instr = nir_src_parent_instr(use);
+      switch (parent_instr->type) {
+      case nir_instr_type_tex:
+      case nir_instr_type_intrinsic:
+         needs_mov = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* It's not a mov but booleans require a conversion */
+   if (def->bit_size == 1)
+      needs_mov = true;
+
+   /* We want rewrites to always cost something or else we'll pull constants
+    * and undefs into the pilot shader.
+    */
+   return needs_mov ? dwords * 2.0 : 1.0;
+}
+
+static bool
+pilot_avoid_instr(const nir_instr *instr, const void *data)
+{
+   return false;
+}
+
+static bool
+lower_pilot_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   const struct pan_nir_opt_pilot_params *p = data;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_subgroup_size: {
+      b->cursor = nir_before_instr(&intr->instr);
+      unsigned sg_size = pan_subgroup_size(pan_arch(p->gpu_id));
+      nir_def_replace(&intr->def, nir_imm_int(b, sg_size));
+      return true;
+   }
+
+   case nir_intrinsic_load_preamble: {
+      b->cursor = nir_before_instr(&intr->instr);
+
+      uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
+      assert(offset % 4 == 0);
+
+      unsigned bit_size = intr->def.bit_size == 1 ? 32 : intr->def.bit_size;
+      unsigned bits = bit_size * intr->def.num_components;
+      nir_def *val = nir_load_push_constant(b,
+         DIV_ROUND_UP(bits, 32), 32, nir_imm_int(b, offset));
+
+      val = nir_bitcast_vector(b, val, bit_size);
+      val = nir_trim_vector(b, val, intr->def.num_components);
+
+      if (intr->def.bit_size == 1)
+         val = nir_b2b1(b, val);
+
+      nir_def_replace(&intr->def, val);
+      return true;
+   }
+
+   case nir_intrinsic_store_preamble: {
+      b->cursor = nir_before_instr(&intr->instr);
+
+      nir_def *val = intr->src[0].ssa;
+      if (val->bit_size == 1)
+         val = nir_b2b32(b, val);
+
+      nir_def *push_const_buf = nir_load_push_constant(
+         b, 1, 64, nir_imm_int(b, p->fau_addr_offset_B));
+
+      uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
+
+      nir_store_global(b, val, nir_iadd_imm(b, push_const_buf, offset),
+                       .align_mul = 4);
+
+      nir_instr_remove(&intr->instr);
+      return true;
+   }
+
+   default:
+      return false;
+   }
+}
+
+bool
+pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p)
+{
+   bool progress = false;
+
+   const nir_opt_preamble_options preamble_opts = {
+      .subgroup_size_uniform = true,
+      .def_size = pilot_def_size,
+      .preamble_storage_size = {
+         [nir_preamble_class_general] = p->fau_pilot_data_space_B,
+      },
+      .instr_cost_cb = pilot_instr_cost,
+      .rewrite_cost_cb = pilot_rewrite_cost,
+      .avoid_instr_cb = pilot_avoid_instr,
+   };
+   NIR_PASS(progress, nir, nir_opt_preamble, &preamble_opts,
+            &p->pilot_data_size_B);
+   if (!progress)
+      return false;
+
+   NIR_PASS(progress, nir, nir_shader_intrinsics_pass, lower_pilot_intr,
+            nir_metadata_control_flow, p);
+
+   return progress;
+}
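To make pilot_def_size()'s sizing rule concrete: 1-bit booleans are stored as 32-bit values and each slot is rounded up to 4 bytes with 4-byte alignment. A self-contained restatement of the same arithmetic, with two worked cases in the comment:

/* Same computation as pilot_def_size(), without the NIR types.
 * pilot_slot_size(1, 3)  -> 12 bytes (three bools stored as 32-bit words)
 * pilot_slot_size(16, 3) -> 8 bytes  (48 bits -> 6 bytes -> rounded to 8) */
static unsigned
pilot_slot_size(unsigned bit_size, unsigned num_components)
{
   unsigned stored_bits = (bit_size == 1 ? 32 : bit_size) * num_components;
   return (stored_bits / 8 + 3) & ~3u; /* align(bits / 8, 4) */
}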
@@ -172,9 +172,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
    if (result != VK_SUCCESS)
       return;
 
-   result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
-   if (result != VK_SUCCESS)
-      return;
+   if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
+      result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
+      if (result != VK_SUCCESS)
+         return;
+   }
 
    if (compute_state_dirty(cmdbuf, CS) ||
        compute_state_dirty(cmdbuf, DESC_STATE)) {
@@ -452,6 +452,89 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
    return VK_SUCCESS;
 }
 
+static void
+launch_pilot(struct panvk_cmd_buffer *cmdbuf,
+             const struct panvk_shader_variant *s,
+             struct panvk_shader_desc_state *desc_state,
+             uint64_t push_uniforms)
+{
+   if (!s->info.pilot.enable)
+      return;
+
+   const struct pan_compute_dim job_size = { 1, 1, 1 };
+
+   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
+   cs_update_compute_ctx(b) {
+      cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SRT_2),
+                   desc_state->res_table);
+
+      uint64_t fau_ptr = push_uniforms | ((uint64_t)s->fau.total_count << 56);
+      cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_2), fau_ptr);
+
+      cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_2),
+                   panvk_priv_mem_dev_addr(s->pilot_spd));
+
+      /* TSD_2 is set in update_tls() */
+
+      struct mali_compute_size_workgroup_packed wg_size;
+      pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
+         cfg.workgroup_size_x = 1;
+         cfg.workgroup_size_y = 1;
+         cfg.workgroup_size_z = 1;
+         cfg.allow_merging_workgroups = true;
+      }
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, WG_SIZE), wg_size.opaque[0]);
+
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_X), 0);
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Y), 0);
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Z), 0);
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), job_size.x);
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Y), job_size.y);
+      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), job_size.z);
+   }
+
+   const struct cs_tracing_ctx *tracing_ctx =
+      &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
+   cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                        1, MALI_TASK_AXIS_X, cs_shader_res_sel(2, 2, 2, 2));
+
+#if PAN_ARCH >= 11
+   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
+   struct cs_index add_val = cs_scratch_reg64(b, 2);
+
+   cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
+                offsetof(struct panvk_cs_subqueue_context, syncobjs));
+
+   cs_add64(b, sync_addr, sync_addr,
+            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
+   cs_move64_to(b, add_val, 1);
+   panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
+                          MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
+                          cs_defer_indirect());
+#else
+   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
+   struct cs_index iter_sb = cs_scratch_reg32(b, 2);
+   struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
+   struct cs_index add_val = cs_scratch_reg64(b, 4);
+
+   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
+              BITFIELD_MASK(3),
+              offsetof(struct panvk_cs_subqueue_context, syncobjs));
+
+   cs_add64(b, sync_addr, sync_addr,
+            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
+   cs_move64_to(b, add_val, 1);
+
+   cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
+      panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
+                             MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
+                             cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));
+   }
+#endif
+
+   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
+}
+
 static bool
 has_depth_att(struct panvk_cmd_buffer *cmdbuf)
 {
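One detail in launch_pilot() worth spelling out: the FAU_2 register does not carry a bare pointer; the FAU word count is packed into the top byte of the push-uniform address, exactly as in the fau_ptr expression above. A small sketch of that packing (the helper names are illustrative):

#include <stdint.h>

/* Sketch: FAU descriptor = 56-bit buffer address | word count in bits 56-63. */
static inline uint64_t
pack_fau_ptr(uint64_t push_uniforms_addr, uint8_t fau_word_count)
{
   return push_uniforms_addr | ((uint64_t)fau_word_count << 56);
}

static inline uint64_t fau_ptr_addr(uint64_t fau_ptr)
{
   return fau_ptr & ((UINT64_C(1) << 56) - 1);
}

static inline uint8_t fau_ptr_count(uint64_t fau_ptr)
{
   return (uint8_t)(fau_ptr >> 56);
}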
@@ -596,6 +679,8 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
       panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader);
    struct cs_builder *b =
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
+   struct cs_builder *cs_b =
+      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
 
    if (!cmdbuf->state.gfx.tsd) {
       if (!state->desc.gpu) {

@@ -616,6 +701,10 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
          cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), state->desc.gpu);
 #endif
       }
+
+      cs_update_compute_ctx(cs_b) {
+         cs_move64_to(cs_b, cs_sr_reg64(b, COMPUTE, TSD_2), state->desc.gpu);
+      }
    }
 
    state->info.tls.size =
@@ -1764,7 +1853,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
       }
    }
 
-   if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
+   bool fs_pilot = fs && fs->info.pilot.enable;
+   if (fs_user_dirty(cmdbuf) ||
+       (fs_pilot && gfx_state_dirty(cmdbuf, DESC_STATE)) ||
+       gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
       uint64_t fau_ptr = 0;
 
       if (fs) {

@@ -1772,6 +1864,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
          if (result != VK_SUCCESS)
            return result;
 
+         if (fs->info.pilot.enable)
+            launch_pilot(cmdbuf, fs, &cmdbuf->state.gfx.fs.desc,
+                         cmdbuf->state.gfx.fs.push_uniforms);
+
          fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
                    ((uint64_t)fs->fau.total_count << 56);
       }
@@ -2310,10 +2406,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
 
    panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
 
-   result = prepare_push_uniforms(cmdbuf, draw);
-   if (result != VK_SUCCESS)
-      return result;
-
    result = prepare_vs(cmdbuf, draw);
    if (result != VK_SUCCESS)
       return result;

@@ -2322,6 +2414,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
    if (result != VK_SUCCESS)
       return result;
 
+   result = prepare_push_uniforms(cmdbuf, draw);
+   if (result != VK_SUCCESS)
+      return result;
+
    /* Assumes 16 byte slots. We could do better. */
    uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
 
@@ -3069,6 +3165,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
 
+   /* Wait on any pilot shaders */
+   const struct panvk_cs_deps deps = {
+      .src[PANVK_SUBQUEUE_COMPUTE] = {
+         .wait_sb_mask = dev->csf.sb.all_iters_mask,
+      },
+      .dst[PANVK_SUBQUEUE_FRAGMENT] = {
+         .wait_subqueue_mask = BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE),
+      },
+   };
+   panvk_per_arch(emit_barrier)(cmdbuf, deps);
+
    /* Now initialize the fragment bits. */
    cs_update_frag_ctx(b) {
       cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
@@ -103,9 +103,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
 
    panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);
 
-   result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
-   if (result != VK_SUCCESS)
-      return;
+   if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
+      result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
+      if (result != VK_SUCCESS)
+         return;
+   }
 
    struct pan_ptr copy_desc_job = {0};
 
@@ -1471,12 +1471,14 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw)
       gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
    }
 
-   result = panvk_per_arch(cmd_prepare_push_uniforms)(
-      cmdbuf, vs, 1);
-   if (result != VK_SUCCESS)
-      return;
+   if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
+      result = panvk_per_arch(cmd_prepare_push_uniforms)(
+         cmdbuf, vs, 1);
+      if (result != VK_SUCCESS)
+         return;
+   }
 
-   if (fs) {
+   if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
       result = panvk_per_arch(cmd_prepare_push_uniforms)(
          cmdbuf, fs, 1);
       if (result != VK_SUCCESS)

@@ -1633,12 +1635,14 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
       gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
    }
 
-   result = panvk_per_arch(cmd_prepare_push_uniforms)(
-      cmdbuf, vs, 1);
-   if (result != VK_SUCCESS)
-      return;
+   if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
+      result = panvk_per_arch(cmd_prepare_push_uniforms)(
+         cmdbuf, vs, 1);
+      if (result != VK_SUCCESS)
+         return;
+   }
 
-   if (fs) {
+   if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
       result = panvk_per_arch(cmd_prepare_push_uniforms)(
          cmdbuf, fs, 1);
       if (result != VK_SUCCESS)
@@ -348,6 +348,8 @@ struct panvk_shader_fau_info {
    BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
    BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
    uint32_t sysval_count;
+   uint32_t push_count;
+   uint32_t pilot_count;
    uint32_t total_count;
 };
 

@@ -416,6 +418,7 @@ struct panvk_shader_variant {
 #endif
       } spds;
    };
+   struct panvk_priv_mem pilot_spd;
 #endif
 
    const char *nir_str;
@@ -17,18 +17,12 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
 
    switch (shader->info.stage) {
    case MESA_SHADER_COMPUTE:
-      if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS))
-         return VK_SUCCESS;
       push_ptr = &cmdbuf->state.compute.push_uniforms;
       break;
    case MESA_SHADER_VERTEX:
-      if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS))
-         return VK_SUCCESS;
      push_ptr = &cmdbuf->state.gfx.vs.push_uniforms;
      break;
    case MESA_SHADER_FRAGMENT:
-      if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS))
-         return VK_SUCCESS;
      push_ptr = &cmdbuf->state.gfx.fs.push_uniforms;
      break;
    default:

@@ -78,6 +72,9 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
    BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS)
       faus[fau++] = push_consts[w];
 
+   /* These are populated by the pilot shader */
+   fau += shader->fau.pilot_count;
+
    for (uint32_t i = 0; i < shader->info.fau_consts_count; i += 2) {
       faus[fau++] = (uint64_t)shader->info.fau_consts[i + 1] << 32 |
                     shader->info.fau_consts[i];
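Putting the FAU bookkeeping together: with panvk's 8-byte FAU words, the fill order above is sysvals, then the push constants actually used, then a hole reserved for pilot-written data, then compile-time FAU constants. A worked example under those assumptions (the counts are made up):

/* Example: 10 sysval words + 6 push-constant words are CPU-written, so
 * pilot data starts at byte offset (10 + 6) * 8 = 128.  If the pilot pass
 * reports 52 bytes of pilot data, that reserves ceil(52 / 8) = 7 words,
 * giving total_count = 10 + 6 + 7 = 23 of the 64 available FAU words. */
static uint32_t
example_total_fau_words(uint32_t sysval_count, uint32_t push_count,
                        uint32_t pilot_data_size_B)
{
   uint32_t pilot_count = (pilot_data_size_B + 7) / 8; /* 8-byte FAU words */
   return sysval_count + push_count + pilot_count;
}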
@@ -569,7 +569,7 @@ load_resource_deref_desc(nir_builder *b, nir_deref_instr *deref,
       b, num_components, bit_size,
       nir_imm_int(b, pan_res_handle(VALHALL_RESOURCE_TABLE_IDX, set + 1)),
       set_offset, .range = ~0u, .align_mul = PANVK_DESCRIPTOR_SIZE,
-      .align_offset = desc_offset);
+      .align_offset = desc_offset, .access = ACCESS_CAN_SPECULATE);
 #endif
 }
 
@@ -1162,6 +1162,9 @@ static bool
 collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
                         struct lower_desc_ctx *ctx)
 {
+   /* We'll also set can_speculate here. */
+   tex->can_speculate = true;
+
    bool recorded = false;
    uint32_t plane = 0;
    int sampler_src_idx =

@@ -1184,6 +1187,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
       struct panvk_subdesc_info subdesc =
          get_sampler_subdesc_info(binding_layout->type, plane);
 
+      if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
+         tex->can_speculate = false;
+
       record_binding(ctx, set, binding, subdesc, max_idx);
       recorded = true;
    }

@@ -1203,6 +1209,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
       struct panvk_subdesc_info subdesc =
         get_tex_subdesc_info(binding_layout->type, plane);
 
+      if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
+         tex->can_speculate = false;
+
       record_binding(ctx, set, binding, subdesc, max_idx);
       recorded = true;
    }

@@ -1210,11 +1219,66 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
    return recorded;
 }
 
+static bool
+mark_load_can_speculate(nir_intrinsic_instr *load,
+                        const struct lower_desc_ctx *ctx)
+{
+   assert(load->intrinsic == nir_intrinsic_load_deref);
+
+   /* If it's already marked CAN_SPECULATE, there's nothing to do */
+   enum gl_access_qualifier access = nir_intrinsic_access(load);
+   if (access & ACCESS_CAN_SPECULATE)
+      return false;
+
+   nir_deref_instr *deref = nir_src_as_deref(load->src[0]);
+   if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo))
+      return false;
+
+   if (nir_deref_mode_is(deref, nir_var_mem_ssbo) &&
+       ((access & ACCESS_VOLATILE) || !(access & ACCESS_CAN_REORDER)))
+      return false;
+
+   while (true) {
+      nir_deref_instr *parent = nir_deref_instr_parent(deref);
+      if (parent == NULL)
+         break;
+      deref = parent;
+   }
+   assert(deref->deref_type == nir_deref_type_cast);
+
+   nir_intrinsic_instr *desc_load = nir_src_as_intrinsic(deref->parent);
+   if (desc_load == NULL ||
+       desc_load->intrinsic != nir_intrinsic_load_vulkan_descriptor)
+      return false;
+
+   nir_intrinsic_instr *desc_index = nir_src_as_intrinsic(desc_load->src[0]);
+   if (desc_index == NULL ||
+       desc_index->intrinsic != nir_intrinsic_vulkan_resource_index)
+      return false;
+
+   unsigned set = nir_intrinsic_desc_set(desc_index);
+   unsigned binding = nir_intrinsic_binding(desc_index);
+   const struct panvk_descriptor_set_layout *set_layout =
+      get_set_layout(set, ctx);
+   const struct panvk_descriptor_set_binding_layout *bind_layout =
+      &set_layout->bindings[binding];
+
+   if (bind_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
+      return false;
+
+   access |= ACCESS_CAN_SPECULATE;
+   nir_intrinsic_set_access(load, access);
+   return true;
+}
+
 static bool
 collect_intr_desc_access(nir_builder *b, nir_intrinsic_instr *intrin,
                          struct lower_desc_ctx *ctx)
 {
    switch (intrin->intrinsic) {
+   case nir_intrinsic_load_deref:
+      return mark_load_can_speculate(intrin, ctx);
+
    case nir_intrinsic_vulkan_resource_index: {
       unsigned set, binding;
 

@@ -1338,6 +1402,9 @@ panvk_per_arch(nir_lower_descriptors)(
    for (uint32_t i = 0; i < set_layout_count; i++)
       ctx.set_layouts[i] = to_panvk_descriptor_set_layout(set_layouts[i]);
 
+   /* Collect descriptor access. This also marks texture and UBO/SSBO loads
+    * as can_speculate based on descriptor binding flags.
+    */
    NIR_PASS(progress, nir, nir_shader_instructions_pass,
            collect_instr_desc_access, nir_metadata_all, &ctx);
    if (!progress)
@@ -65,6 +65,12 @@
 #include "vk_shader.h"
 #include "vk_util.h"
 
+static inline bool
+panvk_wants_pilot_shader(mesa_shader_stage stage, unsigned arch)
+{
+   return arch >= 9 && stage == MESA_SHADER_FRAGMENT;
+}
+
 #define FAU_WORD_COUNT 64
 
 struct panvk_lower_sysvals_context {

@@ -700,6 +706,9 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
       shader_use_sysval(shader, graphics, blend.constants);
    }
 
+   if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH))
+      shader_use_sysval(shader, common, push_uniforms);
+
    progress = false;
    NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
             nir_metadata_all, shader);

@@ -717,8 +726,8 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
    /* 32 FAUs (256 bytes) are reserved for API push constants */
    assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 &&
           "too many sysval FAUs");
-   shader->fau.total_count =
-      shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
+   shader->fau.push_count = BITSET_COUNT(shader->fau.used_push_consts);
+   shader->fau.total_count = shader->fau.sysval_count + shader->fau.push_count;
    assert(shader->fau.total_count <= FAU_WORD_COUNT &&
           "asking for more FAUs than the hardware has to offer");
 

@@ -925,6 +934,26 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
 
    lower_load_push_consts(nir, shader);
 
+   if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) {
+      /* Our pilots will be more efficient if we optimize first. */
+      pan_optimize_nir(nir, input.gpu_id);
+
+      struct pan_nir_opt_pilot_params pilot = {
+         .gpu_id = input.gpu_id,
+         .fau_addr_offset_B = shader_remapped_sysval_offset(
+            shader, sysval_offset(common, push_uniforms)),
+         .fau_pilot_data_offset_B =
+            FAU_WORD_SIZE * shader->fau.total_count,
+         .fau_pilot_data_space_B =
+            FAU_WORD_SIZE * (FAU_WORD_COUNT - shader->fau.total_count),
+      };
+      pan_nir_opt_pilot(nir, &pilot);
+
+      shader->fau.pilot_count =
+         DIV_ROUND_UP(pilot.pilot_data_size_B, FAU_WORD_SIZE);
+      shader->fau.total_count += shader->fau.pilot_count;
+   }
+
    /* Allow the remaining FAU space to be filled with constants. */
    input.fau_consts.max_amount =
       2 * (FAU_WORD_COUNT - shader->fau.total_count);

@@ -1208,6 +1237,23 @@ panvk_shader_upload(struct panvk_device *dev,
       }
 #endif
    }
+
+   if (shader->info.pilot.enable) {
+      shader->pilot_spd =
+         panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
+      if (!panvk_priv_mem_check_alloc(shader->pilot_spd))
+         return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+      panvk_priv_mem_write_desc(shader->pilot_spd, 0, SHADER_PROGRAM, cfg) {
+         cfg.stage = MALI_SHADER_STAGE_COMPUTE,
+         cfg.register_allocation =
+            pan_register_allocation(shader->info.pilot.work_reg_count);
+         cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
+                      shader->info.pilot.offset;
+         cfg.preload.r48_r63 = (shader->info.pilot.preload >> 48);
+         cfg.flush_to_zero_mode = shader_ftz_mode(shader);
+      }
+   }
 #endif
 
    return VK_SUCCESS;

@@ -1237,6 +1283,7 @@ panvk_shader_variant_destroy(struct panvk_shader_variant *shader)
       panvk_pool_free_mem(&shader->spds.pos_triangles);
 #endif
    }
+   panvk_pool_free_mem(&shader->pilot_spd);
 #endif
 
    if (shader->own_bin)