diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 7209d28e853..442e34b2f10 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -6623,6 +6623,8 @@ bool nir_opt_load_skip_helpers(nir_shader *shader, nir_opt_load_skip_helpers_opt void nir_sweep(nir_shader *shader); +void nir_steal_function(nir_shader *nir, nir_function *f); + nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val); gl_system_value nir_system_value_from_intrinsic(nir_intrinsic_op intrin); diff --git a/src/compiler/nir/nir_opt_preamble.c b/src/compiler/nir/nir_opt_preamble.c index 8686926aab3..3084fda9519 100644 --- a/src/compiler/nir/nir_opt_preamble.c +++ b/src/compiler/nir/nir_opt_preamble.c @@ -167,7 +167,6 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) case nir_intrinsic_load_fb_layers_v3d: case nir_intrinsic_load_fep_w_v3d: case nir_intrinsic_load_tcs_num_patches_amd: - case nir_intrinsic_load_sample_positions_pan: case nir_intrinsic_load_pipeline_stat_query_enabled_amd: case nir_intrinsic_load_prim_gen_query_enabled_amd: case nir_intrinsic_load_prim_xfb_query_enabled_amd: @@ -233,6 +232,8 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) case nir_intrinsic_load_constant_agx: case nir_intrinsic_bindless_image_agx: case nir_intrinsic_bindless_sampler_agx: + case nir_intrinsic_load_texel_buf_conv_pan: + case nir_intrinsic_load_texel_buf_index_address_pan: return can_move_srcs(&instr->instr, ctx); /* Image/SSBO loads can be moved if they are CAN_REORDER and their @@ -240,6 +241,7 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx) */ case nir_intrinsic_image_load: case nir_intrinsic_image_samples_identical: + case nir_intrinsic_image_texel_address: case nir_intrinsic_bindless_image_load: case nir_intrinsic_load_global_bounded: case nir_intrinsic_load_ssbo: @@ -813,7 +815,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (num_candidates == 0) { free(ctx.states); - return false; + return nir_no_progress(impl); } def_state **candidates = malloc(sizeof(*candidates) * num_candidates); @@ -880,7 +882,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (num_candidates == 0) { free(ctx.states); free(candidates); - return false; + return nir_no_progress(impl); } /* Step 4: Figure out which candidates we're going to replace and assign an @@ -940,6 +942,12 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, if (analyze_speculation_for_cf_list(&ctx, &impl->body)) analyze_reconstructed(&ctx, impl); + if (!BITSET_TEST_RANGE(ctx.reconstructed_defs, 0, impl->ssa_alloc)) { + free(ctx.states); + free(ctx.reconstructed_defs); + return nir_no_progress(impl); + } + /* Step 5: Actually do the replacement. 
*/ struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL); @@ -980,11 +988,10 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options, } } - nir_progress(true, impl, nir_metadata_control_flow); - ralloc_free(remap_table); free(ctx.states); free(ctx.reconstructed_defs); _mesa_set_destroy(ctx.reconstructed_ifs, NULL); - return true; + + return nir_progress(true, impl, nir_metadata_control_flow); } diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c index 0bba79450a5..25c7663ce87 100644 --- a/src/compiler/nir/nir_sweep.c +++ b/src/compiler/nir/nir_sweep.c @@ -185,6 +185,29 @@ sweep_function(nir_shader *nir, nir_function *f) sweep_impl(nir, f->impl); } +/** Steals a function from one shader into another. + * + * This knows nothing about dependencies so if this function calls other + * functions, you'll need a more complex pass that uses nir_function_clone() + * to steal them all. + */ +void +nir_steal_function(nir_shader *nir, nir_function *f) +{ +#ifndef NDEBUG + if (f->impl) { + nir_foreach_block(block, f->impl) { + nir_foreach_instr(instr, block) { + assert(instr->type != nir_instr_type_call); + } + } + } +#endif + exec_node_remove(&f->node); + exec_list_push_tail(&nir->functions, &f->node); + sweep_function(nir, f); +} + void nir_sweep(nir_shader *nir) { diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 558ee51abf1..ba77467d315 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -6504,7 +6504,7 @@ bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id) } static int -compare_u32(const void* a, const void* b, void* _) +compare_u32(const void* a, const void* b) { const uint32_t va = (uintptr_t)a; const uint32_t vb = (uintptr_t)b; @@ -6673,9 +6673,9 @@ bi_compile_variant_nir(nir_shader *nir, sorted[idx++] = (uintptr_t)entry.data; } - util_qsort_r(sorted, const_amount, sizeof(uint32_t), compare_u32, NULL); + qsort(sorted, const_amount, sizeof(uint32_t), compare_u32); uint32_t max_amount = MIN2(const_amount, ctx->inputs->fau_consts.max_amount); - uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0; + uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0; ralloc_free(sorted); bi_foreach_instr_global_safe(ctx, I) { @@ -6837,7 +6837,7 @@ bi_compile_variant(nir_shader *nir, /* Software invariant: Only a secondary shader can appear at a nonzero * offset, to keep the ABI simple. */ - assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); + assert((offset == 0) ^ (idvs == BI_IDVS_VARYING || idvs == BI_IDVS_PILOT)); struct pan_stats *stats = idvs == BI_IDVS_VARYING ? 
&info->stats_idvs_varying : &info->stats; @@ -6874,6 +6874,11 @@ bi_compile_variant(nir_shader *nir, info->vs.secondary_offset = offset; info->vs.secondary_preload = preload; info->vs.secondary_work_reg_count = ctx->info.work_reg_count; + } else if (idvs == BI_IDVS_PILOT) { + info->pilot.enable = (binary->size > offset); + info->pilot.offset = offset; + info->pilot.preload = preload; + info->pilot.work_reg_count = ctx->info.work_reg_count; } else { info->preload = preload; info->work_reg_count = ctx->info.work_reg_count; @@ -6996,6 +7001,26 @@ bifrost_compile_shader_nir(nir_shader *nir, info->tls_size = nir->scratch_size; + nir_shader *pilot = NULL; + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + if (entrypoint->preamble) { + pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options); + + pilot->info = nir->info; + pilot->info.stage = MESA_SHADER_COMPUTE; + pilot->info.workgroup_size[0] = 1; + pilot->info.workgroup_size[1] = 1; + pilot->info.workgroup_size[2] = 1; + memset(&pilot->info.cs, 0, sizeof(pilot->info.cs)); + + nir_function *pilot_main = entrypoint->preamble; + nir_steal_function(pilot, pilot_main); + entrypoint->preamble = NULL; + + pilot_main->is_preamble = false; + pilot_main->is_entrypoint = true; + } + pan_nir_collect_varyings(nir, info, PAN_MEDIUMP_VARY_32BIT); if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) { @@ -7010,6 +7035,9 @@ bifrost_compile_shader_nir(nir_shader *nir, bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE); } + if (pilot) + bi_compile_variant(pilot, inputs, binary, info, BI_IDVS_PILOT); + if (mesa_shader_stage_is_compute(nir->info.stage)) { /* Workgroups may be merged if the structure of the workgroup is * not software visible. This is true if neither shared memory diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index 6985d9f2191..fc4f9d6f16b 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -1055,6 +1055,9 @@ enum bi_idvs_mode { /* IDVS2 in use. Compiling a deferred shader (v12+) */ BI_IDVS_ALL = 3, + + /* This is a pilot shader, and not a position or vertex shader */ + BI_IDVS_PILOT = 4, }; #define BI_MAX_REGS 64 diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build index 4ce16f92dd7..8693dac41ac 100644 --- a/src/panfrost/compiler/meson.build +++ b/src/panfrost/compiler/meson.build @@ -16,6 +16,7 @@ libpanfrost_compiler_files = files( 'pan_nir_lower_vertex_id.c', 'pan_nir_lower_writeout.c', 'pan_nir_lower_xfb.c', + 'pan_nir_opt_pilot.c', ) subdir('bifrost') diff --git a/src/panfrost/compiler/pan_compiler.h b/src/panfrost/compiler/pan_compiler.h index e29c9158474..68d3ab256e8 100644 --- a/src/panfrost/compiler/pan_compiler.h +++ b/src/panfrost/compiler/pan_compiler.h @@ -315,6 +315,23 @@ struct pan_shader_info { } cs; }; + struct { + /* True if a pilot shader is used */ + bool enable; + /* If the pilot shader is enabled, number of work registers used by + * the pilot shader + */ + uint16_t work_reg_count; + /* If the pilot shader is enabled, the pilot shader's offset in the + * program binary. + */ + uint32_t offset; + /* If the pilot shader is enabled, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t preload; + } pilot; + /* Does the shader contains a barrier? or (for fragment shaders) does it * require helper invocations, which demand the same ordering guarantees * of the hardware? 
These notions are unified in the hardware, so we diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h index a7afdf97724..fc3d46ec200 100644 --- a/src/panfrost/compiler/pan_nir.h +++ b/src/panfrost/compiler/pan_nir.h @@ -60,6 +60,26 @@ bool pan_nir_lower_framebuffer(nir_shader *shader, unsigned blend_shader_nr_samples, bool broken_ld_special); +struct pan_nir_opt_pilot_params { + unsigned gpu_id; + + /* Offset in the FAU where the address to the FAU memory lives. This is + * used for writing FAUs from the pilot shader. + */ + uint32_t fau_addr_offset_B; + + /* Offset in the FAU where the pilot data should go */ + uint32_t fau_pilot_data_offset_B; + + /* Amount of space in the FAU available for pilot data */ + uint32_t fau_pilot_data_space_B; + + /* Amount of actual pilot data consumed. This is an output parameter */ + uint32_t pilot_data_size_B; +}; + +bool pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p); + uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s); /* Specify the mediump lowering behavior for pan_nir_collect_varyings */ diff --git a/src/panfrost/compiler/pan_nir_lower_writeout.c b/src/panfrost/compiler/pan_nir_lower_writeout.c index bfe16713bc8..6e52d8a9e45 100644 --- a/src/panfrost/compiler/pan_nir_lower_writeout.c +++ b/src/panfrost/compiler/pan_nir_lower_writeout.c @@ -93,144 +93,151 @@ kill_depth_stencil_writes(nir_builder *b, nir_intrinsic_instr *intr, return true; } +static bool +lower_zs_store_impl(nir_function_impl *impl) +{ + nir_intrinsic_instr *stores[3] = {NULL}; + nir_intrinsic_instr *last_mask_store = NULL; + nir_block *mask_block = NULL; + unsigned writeout = 0; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location == FRAG_RESULT_DEPTH) { + stores[0] = intr; + writeout |= PAN_WRITEOUT_Z; + } else if (sem.location == FRAG_RESULT_STENCIL) { + stores[1] = intr; + writeout |= PAN_WRITEOUT_S; + } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ + stores[2] = intr; + writeout |= PAN_WRITEOUT_2; + } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) { + last_mask_store = intr; + mask_block = intr->instr.block; + } + } + } + + if (!writeout && !last_mask_store) + return nir_no_progress(impl); + + nir_block *common_block = mask_block; + + /* Ensure all stores are in the same block */ + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (!stores[i]) + continue; + + nir_block *block = stores[i]->instr.block; + + if (common_block) + assert(common_block == block); + else + common_block = block; + } + + /* move data stores in the common block to after the last mask store */ + if (last_mask_store) { + nir_cursor insert_point = nir_after_instr(&last_mask_store->instr); + nir_foreach_instr_safe(instr, mask_block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* stop when we've reached the last store to mask */ + if (intr == last_mask_store) + break; + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location >= FRAG_RESULT_DATA0 && + sem.location <= FRAG_RESULT_DATA7) { + nir_instr_move(insert_point, 
instr); + insert_point = nir_after_instr(instr); + } + } + } + + bool replaced = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (sem.location < FRAG_RESULT_DATA0) + continue; + + if (sem.dual_source_blend_index) + continue; + + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); + + nir_builder b = + nir_builder_at(nir_after_block_before_jump(instr->block)); + + /* Trying to write depth twice results in the + * wrong blend shader being executed on + * Midgard */ + unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); + + pan_nir_emit_combined_store(&b, intr, this_store, stores); + + nir_instr_remove(instr); + + replaced = true; + } + } + + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b = + nir_builder_at(nir_after_block_before_jump(common_block)); + + pan_nir_emit_combined_store(&b, NULL, writeout, stores); + } + + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (stores[i]) + nir_instr_remove(&stores[i]->instr); + } + + return nir_progress(true, impl, nir_metadata_control_flow); +} + bool pan_nir_lower_zs_store(nir_shader *nir) { bool progress = false; - if (nir->info.stage != MESA_SHADER_FRAGMENT) + if (nir->info.stage != MESA_SHADER_FRAGMENT) { + nir_shader_preserve_all_metadata(nir); return false; + } /* Remove all stencil/depth writes if early fragment test is forced. */ if (nir->info.fs.early_fragment_tests) progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes, nir_metadata_control_flow, NULL); - nir_foreach_function_impl(impl, nir) { - nir_intrinsic_instr *stores[3] = {NULL}; - nir_intrinsic_instr *last_mask_store = NULL; - nir_block *mask_block = NULL; - unsigned writeout = 0; - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location == FRAG_RESULT_DEPTH) { - stores[0] = intr; - writeout |= PAN_WRITEOUT_Z; - } else if (sem.location == FRAG_RESULT_STENCIL) { - stores[1] = intr; - writeout |= PAN_WRITEOUT_S; - } else if (sem.dual_source_blend_index) { - assert(!stores[2]); /* there should be only 1 source for dual blending */ - stores[2] = intr; - writeout |= PAN_WRITEOUT_2; - } else if (sem.location == FRAG_RESULT_SAMPLE_MASK) { - last_mask_store = intr; - mask_block = intr->instr.block; - } - } - } - - if (!writeout && !last_mask_store) - continue; - - nir_block *common_block = mask_block; - - /* Ensure all stores are in the same block */ - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (!stores[i]) - continue; - - nir_block *block = stores[i]->instr.block; - - if (common_block) - assert(common_block == block); - else - common_block = block; - } - - /* move data stores in the common block to after the last mask store */ - if (last_mask_store) { - nir_cursor insert_point = nir_after_instr(&last_mask_store->instr); - nir_foreach_instr_safe(instr, mask_block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - /* stop when we've reached the 
last store to mask */ - if (intr == last_mask_store) - break; - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - if (sem.location >= FRAG_RESULT_DATA0 && - sem.location <= FRAG_RESULT_DATA7) { - nir_instr_move(insert_point, instr); - insert_point = nir_after_instr(instr); - } - } - } - - bool replaced = false; - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - - if (sem.location < FRAG_RESULT_DATA0) - continue; - - if (sem.dual_source_blend_index) - continue; - - assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - - nir_builder b = - nir_builder_at(nir_after_block_before_jump(instr->block)); - - /* Trying to write depth twice results in the - * wrong blend shader being executed on - * Midgard */ - unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); - - pan_nir_emit_combined_store(&b, intr, this_store, stores); - - nir_instr_remove(instr); - - replaced = true; - } - } - - /* Insert a store to the depth RT (0xff) if needed */ - if (!replaced) { - nir_builder b = - nir_builder_at(nir_after_block_before_jump(common_block)); - - pan_nir_emit_combined_store(&b, NULL, writeout, stores); - } - - for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { - if (stores[i]) - nir_instr_remove(&stores[i]->instr); - } - - progress = nir_progress(true, impl, nir_metadata_control_flow); - } + nir_foreach_function_impl(impl, nir) + progress |= lower_zs_store_impl(impl); return progress; } diff --git a/src/panfrost/compiler/pan_nir_opt_pilot.c b/src/panfrost/compiler/pan_nir_opt_pilot.c new file mode 100644 index 00000000000..698e833997e --- /dev/null +++ b/src/panfrost/compiler/pan_nir_opt_pilot.c @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2025 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "panfrost/model/pan_model.h" +#include "pan_nir.h" + +static void +pilot_def_size(nir_def *def, unsigned *size, unsigned *alignment, + nir_preamble_class *class) +{ + unsigned bit_size = def->bit_size == 1 ? 
32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   *size = align(bits / 8, 4);
+   *alignment = 4;
+   *class = nir_preamble_class_general;
+}
+
+static float
+pilot_instr_cost(nir_instr *instr, const void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+      if (alu->def.bit_size == 64 || alu->src[0].src.ssa->bit_size == 64)
+         return 10.0;
+
+      /* Shrug */
+      return 2.0;
+   }
+
+   case nir_instr_type_tex:
+      return 20.0;
+
+   case nir_instr_type_intrinsic:
+      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_global:
+      case nir_intrinsic_load_global_constant:
+      case nir_intrinsic_load_ssbo:
+         return 20.0;
+      default:
+         /* Assume it's a sysval or something */
+         return 0.0;
+      }
+
+   case nir_instr_type_load_const:
+   case nir_instr_type_undef:
+      return 0.0;
+
+   default:
+      return 1.0;
+   }
+}
+
+static float
+pilot_rewrite_cost(nir_def *def, const void *data)
+{
+   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
+   unsigned bits = bit_size * def->num_components;
+   unsigned dwords = DIV_ROUND_UP(bits, 32);
+
+   bool needs_mov = false;
+   nir_foreach_use(use, def) {
+      nir_instr *parent_instr = nir_src_parent_instr(use);
+      switch (parent_instr->type) {
+      case nir_instr_type_tex:
+      case nir_instr_type_intrinsic:
+         needs_mov = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* It's not a mov but booleans require a conversion */
+   if (def->bit_size == 1)
+      needs_mov = true;
+
+   /* We want rewrites to always cost something or else we'll pull constants
+    * and undefs into the pilot shader.
+    */
+   return needs_mov ? dwords * 2.0 : 1.0;
+}
+
+static bool
+pilot_avoid_instr(const nir_instr *instr, const void *data)
+{
+   return false;
+}
+
+static bool
+lower_pilot_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   const struct pan_nir_opt_pilot_params *p = data;
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_subgroup_size: {
+      b->cursor = nir_before_instr(&intr->instr);
+      unsigned sg_size = pan_subgroup_size(pan_arch(p->gpu_id));
+      nir_def_replace(&intr->def, nir_imm_int(b, sg_size));
+      return true;
+   }
+
+   case nir_intrinsic_load_preamble: {
+      b->cursor = nir_before_instr(&intr->instr);
+
+      uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
+      assert(offset % 4 == 0);
+
+      unsigned bit_size = intr->def.bit_size == 1 ?
32 : intr->def.bit_size; + unsigned bits = bit_size * intr->def.num_components; + nir_def *val = nir_load_push_constant(b, + DIV_ROUND_UP(bits, 32), 32, nir_imm_int(b, offset)); + + val = nir_bitcast_vector(b, val, bit_size); + val = nir_trim_vector(b, val, intr->def.num_components); + + if (intr->def.bit_size == 1) + val = nir_b2b1(b, val); + + nir_def_replace(&intr->def, val); + return true; + } + + case nir_intrinsic_store_preamble: { + b->cursor = nir_before_instr(&intr->instr); + + nir_def *val = intr->src[0].ssa; + if (val->bit_size == 1) + val = nir_b2b32(b, val); + + nir_def *push_const_buf = nir_load_push_constant( + b, 1, 64, nir_imm_int(b, p->fau_addr_offset_B)); + + uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr); + + nir_store_global(b, val, nir_iadd_imm(b, push_const_buf, offset), + .align_mul = 4); + + nir_instr_remove(&intr->instr); + return true; + } + + default: + return false; + } +} + +bool +pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p) +{ + bool progress = false; + + const nir_opt_preamble_options preamble_opts = { + .subgroup_size_uniform = true, + .def_size = pilot_def_size, + .preamble_storage_size = { + [nir_preamble_class_general] = p->fau_pilot_data_space_B, + }, + .instr_cost_cb = pilot_instr_cost, + .rewrite_cost_cb = pilot_rewrite_cost, + .avoid_instr_cb = pilot_avoid_instr, + }; + NIR_PASS(progress, nir, nir_opt_preamble, &preamble_opts, + &p->pilot_data_size_B); + if (!progress) + return false; + + NIR_PASS(progress, nir, nir_shader_intrinsics_pass, lower_pilot_intr, + nir_metadata_control_flow, p); + + return progress; +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index e1be606f699..99c259dcffd 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -172,9 +172,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) if (result != VK_SUCCESS) return; - result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); + if (result != VK_SUCCESS) + return; + } if (compute_state_dirty(cmdbuf, CS) || compute_state_dirty(cmdbuf, DESC_STATE)) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index b0c56c5f0aa..195f19604c7 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -452,6 +452,89 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf) return VK_SUCCESS; } +static void +launch_pilot(struct panvk_cmd_buffer *cmdbuf, + const struct panvk_shader_variant *s, + struct panvk_shader_desc_state *desc_state, + uint64_t push_uniforms) +{ + if (!s->info.pilot.enable) + return; + + const struct pan_compute_dim job_size = { 1, 1, 1 }; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + cs_update_compute_ctx(b) { + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SRT_2), + desc_state->res_table); + + uint64_t fau_ptr = push_uniforms | ((uint64_t)s->fau.total_count << 56); + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_2), fau_ptr); + + cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_2), + panvk_priv_mem_dev_addr(s->pilot_spd)); + + /* TSD_2 is set in update_tls() */ + + struct mali_compute_size_workgroup_packed wg_size; + pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) { + 
cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + cfg.allow_merging_workgroups = true; + } + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, WG_SIZE), wg_size.opaque[0]); + + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_X), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Y), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Z), 0); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), job_size.x); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Y), job_size.y); + cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), job_size.z); + } + + const struct cs_tracing_ctx *tracing_ctx = + &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing; + cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + 1, MALI_TASK_AXIS_X, cs_shader_res_sel(2, 2, 2, 2)); + +#if PAN_ARCH >= 11 + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index add_val = cs_scratch_reg64(b, 2); + + cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true, + MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, + cs_defer_indirect()); +#else + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + + cs_match_iter_sb(b, x, iter_sb, cmp_scratch) { + panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true, + MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); + } +#endif + + ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; +} + static bool has_depth_att(struct panvk_cmd_buffer *cmdbuf) { @@ -596,6 +679,8 @@ update_tls(struct panvk_cmd_buffer *cmdbuf) panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader); struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + struct cs_builder *cs_b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); if (!cmdbuf->state.gfx.tsd) { if (!state->desc.gpu) { @@ -616,6 +701,10 @@ update_tls(struct panvk_cmd_buffer *cmdbuf) cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), state->desc.gpu); #endif } + + cs_update_compute_ctx(cs_b) { + cs_move64_to(cs_b, cs_sr_reg64(b, COMPUTE, TSD_2), state->desc.gpu); + } } state->info.tls.size = @@ -1764,7 +1853,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf, } } - if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) { + bool fs_pilot = fs && fs->info.pilot.enable; + if (fs_user_dirty(cmdbuf) || + (fs_pilot && gfx_state_dirty(cmdbuf, DESC_STATE)) || + gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) { uint64_t fau_ptr = 0; if (fs) { @@ -1772,6 +1864,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf, if (result != VK_SUCCESS) return result; + if (fs->info.pilot.enable) + launch_pilot(cmdbuf, fs, &cmdbuf->state.gfx.fs.desc, + cmdbuf->state.gfx.fs.push_uniforms); + fau_ptr = cmdbuf->state.gfx.fs.push_uniforms | ((uint64_t)fs->fau.total_count << 56); } @@ -2310,10 +2406,6 @@ prepare_draw(struct panvk_cmd_buffer 
*cmdbuf, struct panvk_draw_info *draw) panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw); - result = prepare_push_uniforms(cmdbuf, draw); - if (result != VK_SUCCESS) - return result; - result = prepare_vs(cmdbuf, draw); if (result != VK_SUCCESS) return result; @@ -2322,6 +2414,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) if (result != VK_SUCCESS) return result; + result = prepare_push_uniforms(cmdbuf, draw); + if (result != VK_SUCCESS) + return result; + /* Assumes 16 byte slots. We could do better. */ uint32_t varying_size = get_varying_slots(cmdbuf) * 16; @@ -3069,6 +3165,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; + /* Wait on any pilot shaders */ + const struct panvk_cs_deps deps = { + .src[PANVK_SUBQUEUE_COMPUTE] = { + .wait_sb_mask = dev->csf.sb.all_iters_mask, + }, + .dst[PANVK_SUBQUEUE_FRAGMENT] = { + .wait_subqueue_mask = BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE), + }, + }; + panvk_per_arch(emit_barrier)(cmdbuf, deps); + /* Now initialize the fragment bits. */ cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 50e09514f58..b55e92f22c5 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -103,9 +103,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info); - result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1); + if (result != VK_SUCCESS) + return; + } struct pan_ptr copy_desc_job = {0}; diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c index e381fd614a8..7d524342373 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_draw.c @@ -1471,12 +1471,14 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw) gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } - result = panvk_per_arch(cmd_prepare_push_uniforms)( - cmdbuf, vs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, vs, 1); + if (result != VK_SUCCESS) + return; + } - if (fs) { + if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) @@ -1633,12 +1635,14 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); } - result = panvk_per_arch(cmd_prepare_push_uniforms)( - cmdbuf, vs, 1); - if (result != VK_SUCCESS) - return; + if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { + result = panvk_per_arch(cmd_prepare_push_uniforms)( + cmdbuf, vs, 1); + if (result != VK_SUCCESS) + return; + } - if (fs) { + if (fs && compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) { result = panvk_per_arch(cmd_prepare_push_uniforms)( cmdbuf, fs, 1); if (result != VK_SUCCESS) diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h index 924af49e638..1f77174680d 100644 --- a/src/panfrost/vulkan/panvk_shader.h +++ 
b/src/panfrost/vulkan/panvk_shader.h @@ -348,6 +348,8 @@ struct panvk_shader_fau_info { BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS); BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS); uint32_t sysval_count; + uint32_t push_count; + uint32_t pilot_count; uint32_t total_count; }; @@ -416,6 +418,7 @@ struct panvk_shader_variant { #endif } spds; }; + struct panvk_priv_mem pilot_spd; #endif const char *nir_str; diff --git a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c index 9428bc9f950..3553ecd7c02 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_push_constant.c @@ -17,18 +17,12 @@ panvk_per_arch(cmd_prepare_push_uniforms)( switch (shader->info.stage) { case MESA_SHADER_COMPUTE: - if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.compute.push_uniforms; break; case MESA_SHADER_VERTEX: - if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.gfx.vs.push_uniforms; break; case MESA_SHADER_FRAGMENT: - if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) - return VK_SUCCESS; push_ptr = &cmdbuf->state.gfx.fs.push_uniforms; break; default: @@ -78,6 +72,9 @@ panvk_per_arch(cmd_prepare_push_uniforms)( BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS) faus[fau++] = push_consts[w]; + /* These are populated by the pilot shader */ + fau += shader->fau.pilot_count; + for (uint32_t i = 0; i < shader->info.fau_consts_count; i += 2) { faus[fau++] = (uint64_t)shader->info.fau_consts[i + 1] << 32 | shader->info.fau_consts[i]; diff --git a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c index 91376e3ab83..b6a5d89e48a 100644 --- a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c +++ b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c @@ -569,7 +569,7 @@ load_resource_deref_desc(nir_builder *b, nir_deref_instr *deref, b, num_components, bit_size, nir_imm_int(b, pan_res_handle(VALHALL_RESOURCE_TABLE_IDX, set + 1)), set_offset, .range = ~0u, .align_mul = PANVK_DESCRIPTOR_SIZE, - .align_offset = desc_offset); + .align_offset = desc_offset, .access = ACCESS_CAN_SPECULATE); #endif } @@ -1162,6 +1162,9 @@ static bool collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct lower_desc_ctx *ctx) { + /* We'll also set can_speculate here. 
*/ + tex->can_speculate = true; + bool recorded = false; uint32_t plane = 0; int sampler_src_idx = @@ -1184,6 +1187,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct panvk_subdesc_info subdesc = get_sampler_subdesc_info(binding_layout->type, plane); + if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + tex->can_speculate = false; + record_binding(ctx, set, binding, subdesc, max_idx); recorded = true; } @@ -1203,6 +1209,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, struct panvk_subdesc_info subdesc = get_tex_subdesc_info(binding_layout->type, plane); + if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + tex->can_speculate = false; + record_binding(ctx, set, binding, subdesc, max_idx); recorded = true; } @@ -1210,11 +1219,66 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex, return recorded; } +static bool +mark_load_can_speculate(nir_intrinsic_instr *load, + const struct lower_desc_ctx *ctx) +{ + assert(load->intrinsic == nir_intrinsic_load_deref); + + /* If it's already marked CAN_SPECULATE, there's nothing to do */ + enum gl_access_qualifier access = nir_intrinsic_access(load); + if (access & ACCESS_CAN_SPECULATE) + return false; + + nir_deref_instr *deref = nir_src_as_deref(load->src[0]); + if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo)) + return false; + + if (nir_deref_mode_is(deref, nir_var_mem_ssbo) && + ((access & ACCESS_VOLATILE) || !(access & ACCESS_CAN_REORDER))) + return false; + + while (true) { + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent == NULL) + break; + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + nir_intrinsic_instr *desc_load = nir_src_as_intrinsic(deref->parent); + if (desc_load == NULL || + desc_load->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + nir_intrinsic_instr *desc_index = nir_src_as_intrinsic(desc_load->src[0]); + if (desc_index == NULL || + desc_index->intrinsic != nir_intrinsic_vulkan_resource_index) + return false; + + unsigned set = nir_intrinsic_desc_set(desc_index); + unsigned binding = nir_intrinsic_binding(desc_index); + const struct panvk_descriptor_set_layout *set_layout = + get_set_layout(set, ctx); + const struct panvk_descriptor_set_binding_layout *bind_layout = + &set_layout->bindings[binding]; + + if (bind_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT) + return false; + + access |= ACCESS_CAN_SPECULATE; + nir_intrinsic_set_access(load, access); + return true; +} + static bool collect_intr_desc_access(nir_builder *b, nir_intrinsic_instr *intrin, struct lower_desc_ctx *ctx) { switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + return mark_load_can_speculate(intrin, ctx); + case nir_intrinsic_vulkan_resource_index: { unsigned set, binding; @@ -1338,6 +1402,9 @@ panvk_per_arch(nir_lower_descriptors)( for (uint32_t i = 0; i < set_layout_count; i++) ctx.set_layouts[i] = to_panvk_descriptor_set_layout(set_layouts[i]); + /* Collect descriptor access. This also marks texture and UBO/SSBO loads + * as can_speculate based on descriptor binding flags. 
+ */ NIR_PASS(progress, nir, nir_shader_instructions_pass, collect_instr_desc_access, nir_metadata_all, &ctx); if (!progress) diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 60dd109f215..e271fc849b8 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -65,6 +65,12 @@ #include "vk_shader.h" #include "vk_util.h" +static inline bool +panvk_wants_pilot_shader(mesa_shader_stage stage, unsigned arch) +{ + return arch >= 9 && stage == MESA_SHADER_FRAGMENT; +} + #define FAU_WORD_COUNT 64 struct panvk_lower_sysvals_context { @@ -700,6 +706,9 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader) shader_use_sysval(shader, graphics, blend.constants); } + if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) + shader_use_sysval(shader, common, push_uniforms); + progress = false; NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant, nir_metadata_all, shader); @@ -717,8 +726,8 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader) /* 32 FAUs (256 bytes) are reserved for API push constants */ assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 && "too many sysval FAUs"); - shader->fau.total_count = - shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts); + shader->fau.push_count = BITSET_COUNT(shader->fau.used_push_consts); + shader->fau.total_count = shader->fau.sysval_count + shader->fau.push_count; assert(shader->fau.total_count <= FAU_WORD_COUNT && "asking for more FAUs than the hardware has to offer"); @@ -925,6 +934,26 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir, lower_load_push_consts(nir, shader); + if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) { + /* Our pilots will be more efficient if we optimize first. */ + pan_optimize_nir(nir, input.gpu_id); + + struct pan_nir_opt_pilot_params pilot = { + .gpu_id = input.gpu_id, + .fau_addr_offset_B = shader_remapped_sysval_offset( + shader, sysval_offset(common, push_uniforms)), + .fau_pilot_data_offset_B = + FAU_WORD_SIZE * shader->fau.total_count, + .fau_pilot_data_space_B = + FAU_WORD_SIZE * (FAU_WORD_COUNT - shader->fau.total_count), + }; + pan_nir_opt_pilot(nir, &pilot); + + shader->fau.pilot_count = + DIV_ROUND_UP(pilot.pilot_data_size_B, FAU_WORD_SIZE); + shader->fau.total_count += shader->fau.pilot_count; + } + /* Allow the remaining FAU space to be filled with constants. */ input.fau_consts.max_amount = 2 * (FAU_WORD_COUNT - shader->fau.total_count); @@ -1208,6 +1237,23 @@ panvk_shader_upload(struct panvk_device *dev, } #endif } + + if (shader->info.pilot.enable) { + shader->pilot_spd = + panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM); + if (!panvk_priv_mem_check_alloc(shader->pilot_spd)) + return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + panvk_priv_mem_write_desc(shader->pilot_spd, 0, SHADER_PROGRAM, cfg) { + cfg.stage = MALI_SHADER_STAGE_COMPUTE, + cfg.register_allocation = + pan_register_allocation(shader->info.pilot.work_reg_count); + cfg.binary = panvk_shader_variant_get_dev_addr(shader) + + shader->info.pilot.offset; + cfg.preload.r48_r63 = (shader->info.pilot.preload >> 48); + cfg.flush_to_zero_mode = shader_ftz_mode(shader); + } + } #endif return VK_SUCCESS; @@ -1237,6 +1283,7 @@ panvk_shader_variant_destroy(struct panvk_shader_variant *shader) panvk_pool_free_mem(&shader->spds.pos_triangles); #endif } + panvk_pool_free_mem(&shader->pilot_spd); #endif if (shader->own_bin)
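
For reference, the sketch below condenses how the new nir_steal_function() helper is meant to be used together with nir_opt_preamble(), following the bifrost_compile_shader_nir() hunk above: the preamble function is re-homed into a freshly created single-invocation compute shader, which is then compiled as the pilot binary. This is an illustrative sketch, not part of the patch; the helper name make_pilot_shader() is hypothetical.

#include <string.h>
#include "compiler/nir/nir.h"

/* Hypothetical helper, mirroring the pattern in bifrost_compile_shader_nir()
 * above; not part of this patch.
 */
static nir_shader *
make_pilot_shader(nir_shader *nir)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
   if (!entrypoint->preamble)
      return NULL;

   /* The pilot runs as a 1x1x1 compute job ahead of the main shader. */
   nir_shader *pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options);
   pilot->info = nir->info;
   pilot->info.stage = MESA_SHADER_COMPUTE;
   pilot->info.workgroup_size[0] = 1;
   pilot->info.workgroup_size[1] = 1;
   pilot->info.workgroup_size[2] = 1;
   memset(&pilot->info.cs, 0, sizeof(pilot->info.cs));

   /* nir_steal_function() moves the function wholesale; it asserts that the
    * preamble does not call other functions, so no dependency handling is
    * needed here.
    */
   nir_function *pilot_main = entrypoint->preamble;
   nir_steal_function(pilot, pilot_main);
   entrypoint->preamble = NULL;

   pilot_main->is_preamble = false;
   pilot_main->is_entrypoint = true;

   return pilot;
}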