Merge branch 'panvk/pilot' into 'main'

Draft: Very draft! Panvk pilot shaders

See merge request mesa/mesa!39009
Faith Ekstrand 2025-12-20 01:47:24 +00:00
commit 600fdfbc09
18 changed files with 697 additions and 166 deletions

View file

@ -6623,6 +6623,8 @@ bool nir_opt_load_skip_helpers(nir_shader *shader, nir_opt_load_skip_helpers_opt
void nir_sweep(nir_shader *shader);
void nir_steal_function(nir_shader *nir, nir_function *f);
nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val);
gl_system_value nir_system_value_from_intrinsic(nir_intrinsic_op intrin);

View file

@ -167,7 +167,6 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
case nir_intrinsic_load_fb_layers_v3d:
case nir_intrinsic_load_fep_w_v3d:
case nir_intrinsic_load_tcs_num_patches_amd:
case nir_intrinsic_load_sample_positions_pan:
case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
case nir_intrinsic_load_prim_gen_query_enabled_amd:
case nir_intrinsic_load_prim_xfb_query_enabled_amd:
@ -233,6 +232,8 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
case nir_intrinsic_load_constant_agx:
case nir_intrinsic_bindless_image_agx:
case nir_intrinsic_bindless_sampler_agx:
case nir_intrinsic_load_texel_buf_conv_pan:
case nir_intrinsic_load_texel_buf_index_address_pan:
return can_move_srcs(&instr->instr, ctx);
/* Image/SSBO loads can be moved if they are CAN_REORDER and their
@ -240,6 +241,7 @@ can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
*/
case nir_intrinsic_image_load:
case nir_intrinsic_image_samples_identical:
case nir_intrinsic_image_texel_address:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_load_global_bounded:
case nir_intrinsic_load_ssbo:
@ -813,7 +815,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (num_candidates == 0) {
free(ctx.states);
return false;
return nir_no_progress(impl);
}
def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
@ -880,7 +882,7 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (num_candidates == 0) {
free(ctx.states);
free(candidates);
return false;
return nir_no_progress(impl);
}
/* Step 4: Figure out which candidates we're going to replace and assign an
@ -940,6 +942,12 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
if (analyze_speculation_for_cf_list(&ctx, &impl->body))
analyze_reconstructed(&ctx, impl);
if (!BITSET_TEST_RANGE(ctx.reconstructed_defs, 0, impl->ssa_alloc)) {
free(ctx.states);
free(ctx.reconstructed_defs);
return nir_no_progress(impl);
}
/* Step 5: Actually do the replacement. */
struct hash_table *remap_table =
_mesa_pointer_hash_table_create(NULL);
@ -980,11 +988,10 @@ nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
}
}
nir_progress(true, impl, nir_metadata_control_flow);
ralloc_free(remap_table);
free(ctx.states);
free(ctx.reconstructed_defs);
_mesa_set_destroy(ctx.reconstructed_ifs, NULL);
return true;
return nir_progress(true, impl, nir_metadata_control_flow);
}

View file

@ -185,6 +185,29 @@ sweep_function(nir_shader *nir, nir_function *f)
sweep_impl(nir, f->impl);
}
/** Steals a function from one shader into another.
*
* This knows nothing about dependencies so if this function calls other
* functions, you'll need a more complex pass that uses nir_function_clone()
* to steal them all.
*/
void
nir_steal_function(nir_shader *nir, nir_function *f)
{
#ifndef NDEBUG
if (f->impl) {
nir_foreach_block(block, f->impl) {
nir_foreach_instr(instr, block) {
assert(instr->type != nir_instr_type_call);
}
}
}
#endif
exec_node_remove(&f->node);
exec_list_push_tail(&nir->functions, &f->node);
sweep_function(nir, f);
}
void
nir_sweep(nir_shader *nir)
{

View file

@ -6504,7 +6504,7 @@ bifrost_lower_texture_late_nir(nir_shader *nir, unsigned gpu_id)
}
static int
compare_u32(const void* a, const void* b, void* _)
compare_u32(const void* a, const void* b)
{
const uint32_t va = (uintptr_t)a;
const uint32_t vb = (uintptr_t)b;
@ -6673,7 +6673,7 @@ bi_compile_variant_nir(nir_shader *nir,
sorted[idx++] = (uintptr_t)entry.data;
}
util_qsort_r(sorted, const_amount, sizeof(uint32_t), compare_u32, NULL);
qsort(sorted, const_amount, sizeof(uint32_t), compare_u32);
uint32_t max_amount = MIN2(const_amount, ctx->inputs->fau_consts.max_amount);
uint32_t min_count_for_fau = max_amount > 0 ? sorted[max_amount - 1] : 0;
ralloc_free(sorted);
@ -6837,7 +6837,7 @@ bi_compile_variant(nir_shader *nir,
/* Software invariant: Only a secondary shader can appear at a nonzero
* offset, to keep the ABI simple. */
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING || idvs == BI_IDVS_PILOT));
struct pan_stats *stats =
idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats;
@ -6874,6 +6874,11 @@ bi_compile_variant(nir_shader *nir,
info->vs.secondary_offset = offset;
info->vs.secondary_preload = preload;
info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
} else if (idvs == BI_IDVS_PILOT) {
info->pilot.enable = (binary->size > offset);
info->pilot.offset = offset;
info->pilot.preload = preload;
info->pilot.work_reg_count = ctx->info.work_reg_count;
} else {
info->preload = preload;
info->work_reg_count = ctx->info.work_reg_count;
@ -6996,6 +7001,26 @@ bifrost_compile_shader_nir(nir_shader *nir,
info->tls_size = nir->scratch_size;
nir_shader *pilot = NULL;
nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
if (entrypoint->preamble) {
pilot = nir_shader_create(nir, MESA_SHADER_COMPUTE, nir->options, NULL);
pilot->info = nir->info;
pilot->info.stage = MESA_SHADER_COMPUTE;
pilot->info.workgroup_size[0] = 1;
pilot->info.workgroup_size[1] = 1;
pilot->info.workgroup_size[2] = 1;
memset(&pilot->info.cs, 0, sizeof(pilot->info.cs));
nir_function *pilot_main = entrypoint->preamble;
nir_steal_function(pilot, pilot_main);
entrypoint->preamble = NULL;
pilot_main->is_preamble = false;
pilot_main->is_entrypoint = true;
}
pan_nir_collect_varyings(nir, info, PAN_MEDIUMP_VARY_32BIT);
if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) {
@ -7010,6 +7035,9 @@ bifrost_compile_shader_nir(nir_shader *nir,
bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
}
if (pilot)
bi_compile_variant(pilot, inputs, binary, info, BI_IDVS_PILOT);
if (mesa_shader_stage_is_compute(nir->info.stage)) {
/* Workgroups may be merged if the structure of the workgroup is
* not software visible. This is true if neither shared memory

View file

@ -1055,6 +1055,9 @@ enum bi_idvs_mode {
/* IDVS2 in use. Compiling a deferred shader (v12+) */
BI_IDVS_ALL = 3,
/* This is a pilot shader, and not a position or vertex shader */
BI_IDVS_PILOT = 4,
};
#define BI_MAX_REGS 64

View file

@ -16,6 +16,7 @@ libpanfrost_compiler_files = files(
'pan_nir_lower_vertex_id.c',
'pan_nir_lower_writeout.c',
'pan_nir_lower_xfb.c',
'pan_nir_opt_pilot.c',
)
subdir('bifrost')

View file

@ -315,6 +315,23 @@ struct pan_shader_info {
} cs;
};
struct {
/* True if a pilot shader is used */
bool enable;
/* If the pilot shader is enabled, number of work registers used by
* the pilot shader
*/
uint16_t work_reg_count;
/* If the pilot shader is enabled, the pilot shader's offset in the
* program binary.
*/
uint32_t offset;
/* If the pilot shader is enabled, bit mask of preloaded registers
* used by the pilot shader
*/
uint64_t preload;
} pilot;
/* Does the shader contains a barrier? or (for fragment shaders) does it
* require helper invocations, which demand the same ordering guarantees
* of the hardware? These notions are unified in the hardware, so we

View file

@ -60,6 +60,26 @@ bool pan_nir_lower_framebuffer(nir_shader *shader,
unsigned blend_shader_nr_samples,
bool broken_ld_special);
struct pan_nir_opt_pilot_params {
unsigned gpu_id;
/* Offset in the FAU where the address to the FAU memory lives. This is
* used for writing FAUs from the pilot shader.
*/
uint32_t fau_addr_offset_B;
/* Offset in the FAU where the pilot data should go */
uint32_t fau_pilot_data_offset_B;
/* Amount of space in the FAU available for pilot data */
uint32_t fau_pilot_data_space_B;
/* Amount of actual pilot data consumed. This is an output parameter */
uint32_t pilot_data_size_B;
};
bool pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p);
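
A rough usage sketch of the convention these fields describe (illustrative only, not part of this header; gpu_id, used_fau_words and push_uniforms_sysval_offset_B are hypothetical locals, and the 8-byte FAU word size is taken from the 32-FAU/256-byte note elsewhere in this MR): the caller hands the pass whatever FAU space is left after its own sysvals and push constants, then reads back how much of it the pilot consumed.

/* Illustrative sketch only -- the real call site lives in panvk. */
struct pan_nir_opt_pilot_params params = {
   .gpu_id = gpu_id,
   .fau_addr_offset_B = push_uniforms_sysval_offset_B,
   .fau_pilot_data_offset_B = used_fau_words * 8,
   .fau_pilot_data_space_B = (64 - used_fau_words) * 8,
};
if (pan_nir_opt_pilot(nir, &params)) {
   /* Reserve this many FAU words for data written by the pilot shader. */
   unsigned pilot_fau_words = DIV_ROUND_UP(params.pilot_data_size_B, 8);
}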
uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s);
/* Specify the mediump lowering behavior for pan_nir_collect_varyings */

View file

@ -93,20 +93,9 @@ kill_depth_stencil_writes(nir_builder *b, nir_intrinsic_instr *intr,
return true;
}
bool
pan_nir_lower_zs_store(nir_shader *nir)
static bool
lower_zs_store_impl(nir_function_impl *impl)
{
bool progress = false;
if (nir->info.stage != MESA_SHADER_FRAGMENT)
return false;
/* Remove all stencil/depth writes if early fragment test is forced. */
if (nir->info.fs.early_fragment_tests)
progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes,
nir_metadata_control_flow, NULL);
nir_foreach_function_impl(impl, nir) {
nir_intrinsic_instr *stores[3] = {NULL};
nir_intrinsic_instr *last_mask_store = NULL;
nir_block *mask_block = NULL;
@ -140,7 +129,7 @@ pan_nir_lower_zs_store(nir_shader *nir)
}
if (!writeout && !last_mask_store)
continue;
return nir_no_progress(impl);
nir_block *common_block = mask_block;
@ -229,8 +218,26 @@ pan_nir_lower_zs_store(nir_shader *nir)
nir_instr_remove(&stores[i]->instr);
}
progress = nir_progress(true, impl, nir_metadata_control_flow);
return nir_progress(true, impl, nir_metadata_control_flow);
}
bool
pan_nir_lower_zs_store(nir_shader *nir)
{
bool progress = false;
if (nir->info.stage != MESA_SHADER_FRAGMENT) {
nir_shader_preserve_all_metadata(nir);
return false;
}
/* Remove all stencil/depth writes if early fragment test is forced. */
if (nir->info.fs.early_fragment_tests)
progress |= nir_shader_intrinsics_pass(nir, kill_depth_stencil_writes,
nir_metadata_control_flow, NULL);
nir_foreach_function_impl(impl, nir)
progress |= lower_zs_store_impl(impl);
return progress;
}

View file

@ -0,0 +1,194 @@
/*
* Copyright (C) 2025 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "panfrost/model/pan_model.h"
#include "pan_nir.h"
static void
pilot_def_size(nir_def *def, unsigned *size, unsigned *alignment,
nir_preamble_class *class)
{
unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
unsigned bits = bit_size * def->num_components;
*size = align(bits / 8, 4);
*alignment = 4;
*class = nir_preamble_class_general;
}
static float
pilot_instr_cost(nir_instr *instr, const void *data)
{
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
if (alu->def.bit_size == 64 || alu->src[0].src.ssa->bit_size == 64)
return 10.0;
/* Shrug */
return 2.0;
}
case nir_instr_type_tex:
return 20.0;
case nir_instr_type_intrinsic:
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_load_ssbo:
return 20.0;
default:
/* Assume it's a sysval or something */
return 0.0;
}
case nir_instr_type_load_const:
case nir_instr_type_undef:
return 0.0;
default:
return 1.0;
}
}
static float
pilot_rewrite_cost(nir_def *def, const void *data)
{
unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
unsigned bits = bit_size * def->num_components;
unsigned dwords = DIV_ROUND_UP(bits, 32);
bool needs_mov = false;
nir_foreach_use(use, def) {
nir_instr *parent_instr = nir_src_parent_instr(use);
switch (parent_instr->type) {
case nir_instr_type_tex:
case nir_instr_type_intrinsic:
needs_mov = true;
break;
default:
break;
}
}
/* It's not a mov but booleans require a conversion */
if (def->bit_size == 1)
needs_mov = true;
/* We want rewrites to always cost something or else we'll pull constants
* and undefs into the pilot shader.
*/
return needs_mov ? dwords * 2.0 : 1.0;
}
static bool
pilot_avoid_instr(const nir_instr *instr, const void *data)
{
return false;
}
static bool
lower_pilot_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
const struct pan_nir_opt_pilot_params *p = data;
switch (intr->intrinsic) {
case nir_intrinsic_load_subgroup_size: {
b->cursor = nir_before_instr(&intr->instr);
unsigned sg_size = pan_subgroup_size(pan_arch(p->gpu_id));
nir_def_replace(&intr->def, nir_imm_int(b, sg_size));
return true;
}
case nir_intrinsic_load_preamble: {
b->cursor = nir_before_instr(&intr->instr);
uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
assert(offset % 4 == 0);
unsigned bit_size = intr->def.bit_size == 1 ? 32 : intr->def.bit_size;
unsigned bits = bit_size * intr->def.num_components;
nir_def *val = nir_load_push_constant(b,
DIV_ROUND_UP(bits, 32), 32, nir_imm_int(b, offset));
val = nir_bitcast_vector(b, val, bit_size);
val = nir_trim_vector(b, val, intr->def.num_components);
if (intr->def.bit_size == 1)
val = nir_b2b1(b, val);
nir_def_replace(&intr->def, val);
return true;
}
case nir_intrinsic_store_preamble: {
b->cursor = nir_before_instr(&intr->instr);
nir_def *val = intr->src[0].ssa;
if (val->bit_size == 1)
val = nir_b2b32(b, val);
nir_def *push_const_buf = nir_load_push_constant(
b, 1, 64, nir_imm_int(b, p->fau_addr_offset_B));
uint32_t offset = p->fau_pilot_data_offset_B + nir_intrinsic_base(intr);
nir_store_global(b, val, nir_iadd_imm(b, push_const_buf, offset),
.align_mul = 4);
nir_instr_remove(&intr->instr);
return true;
}
default:
return false;
}
}
bool
pan_nir_opt_pilot(nir_shader *nir, struct pan_nir_opt_pilot_params *p)
{
bool progress = false;
const nir_opt_preamble_options preamble_opts = {
.subgroup_size_uniform = true,
.def_size = pilot_def_size,
.preamble_storage_size = {
[nir_preamble_class_general] = p->fau_pilot_data_space_B,
},
.instr_cost_cb = pilot_instr_cost,
.rewrite_cost_cb = pilot_rewrite_cost,
.avoid_instr_cb = pilot_avoid_instr,
};
NIR_PASS(progress, nir, nir_opt_preamble, &preamble_opts,
&p->pilot_data_size_B);
if (!progress)
return false;
NIR_PASS(progress, nir, nir_shader_intrinsics_pass, lower_pilot_intr,
nir_metadata_control_flow, p);
return progress;
}

View file

@ -172,9 +172,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
if (result != VK_SUCCESS)
return;
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
if (result != VK_SUCCESS)
return;
}
if (compute_state_dirty(cmdbuf, CS) ||
compute_state_dirty(cmdbuf, DESC_STATE)) {

View file

@ -452,6 +452,89 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
return VK_SUCCESS;
}
static void
launch_pilot(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_shader_variant *s,
struct panvk_shader_desc_state *desc_state,
uint64_t push_uniforms)
{
if (!s->info.pilot.enable)
return;
const struct pan_compute_dim job_size = { 1, 1, 1 };
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
cs_update_compute_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SRT_2),
desc_state->res_table);
uint64_t fau_ptr = push_uniforms | ((uint64_t)s->fau.total_count << 56);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_2), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_2),
panvk_priv_mem_dev_addr(s->pilot_spd));
/* TSD_2 is set in update_tls() */
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
cfg.workgroup_size_x = 1;
cfg.workgroup_size_y = 1;
cfg.workgroup_size_z = 1;
cfg.allow_merging_workgroups = true;
}
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, WG_SIZE), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_X), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Y), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_OFFSET_Z), 0);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), job_size.x);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Y), job_size.y);
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), job_size.z);
}
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1, MALI_TASK_AXIS_X, cs_shader_res_sel(2, 2, 2, 2));
#if PAN_ARCH >= 11
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index add_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_defer_indirect());
#else
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
struct cs_index add_val = cs_scratch_reg64(b, 4);
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
panvk_instr_sync64_add(cmdbuf, PANVK_SUBQUEUE_COMPUTE, true,
MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));
}
#endif
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}
static bool
has_depth_att(struct panvk_cmd_buffer *cmdbuf)
{
@ -596,6 +679,8 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
panvk_shader_only_variant(cmdbuf->state.gfx.fs.shader);
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
struct cs_builder *cs_b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
if (!cmdbuf->state.gfx.tsd) {
if (!state->desc.gpu) {
@ -616,6 +701,10 @@ update_tls(struct panvk_cmd_buffer *cmdbuf)
cs_move64_to(b, cs_sr_reg64(b, IDVS, TSD_0), state->desc.gpu);
#endif
}
cs_update_compute_ctx(cs_b) {
cs_move64_to(cs_b, cs_sr_reg64(cs_b, COMPUTE, TSD_2), state->desc.gpu);
}
}
state->info.tls.size =
@ -1764,7 +1853,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
}
}
if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
bool fs_pilot = fs && fs->info.pilot.enable;
if (fs_user_dirty(cmdbuf) ||
(fs_pilot && gfx_state_dirty(cmdbuf, DESC_STATE)) ||
gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
uint64_t fau_ptr = 0;
if (fs) {
@ -1772,6 +1864,10 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
if (result != VK_SUCCESS)
return result;
if (fs->info.pilot.enable)
launch_pilot(cmdbuf, fs, &cmdbuf->state.gfx.fs.desc,
cmdbuf->state.gfx.fs.push_uniforms);
fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
((uint64_t)fs->fau.total_count << 56);
}
@ -2310,10 +2406,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
result = prepare_push_uniforms(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
result = prepare_vs(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
@ -2322,6 +2414,10 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
if (result != VK_SUCCESS)
return result;
result = prepare_push_uniforms(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
/* Assumes 16 byte slots. We could do better. */
uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
@ -3069,6 +3165,17 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
/* Wait on any pilot shaders */
const struct panvk_cs_deps deps = {
.src[PANVK_SUBQUEUE_COMPUTE] = {
.wait_sb_mask = dev->csf.sb.all_iters_mask,
},
.dst[PANVK_SUBQUEUE_FRAGMENT] = {
.wait_subqueue_mask = BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE),
},
};
panvk_per_arch(emit_barrier)(cmdbuf, deps);
/* Now initialize the fragment bits. */
cs_update_frag_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),

View file

@ -103,9 +103,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);
if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, cs, 1);
if (result != VK_SUCCESS)
return;
}
struct pan_ptr copy_desc_job = {0};

View file

@ -1471,12 +1471,14 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_data *draw)
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, vs, 1);
if (result != VK_SUCCESS)
return;
}
if (fs) {
if (fs && gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, fs, 1);
if (result != VK_SUCCESS)
@ -1633,12 +1635,14 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
}
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, vs, 1);
if (result != VK_SUCCESS)
return;
}
if (fs) {
if (fs && gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, fs, 1);
if (result != VK_SUCCESS)

View file

@ -348,6 +348,8 @@ struct panvk_shader_fau_info {
BITSET_DECLARE(used_sysvals, MAX_SYSVAL_FAUS);
BITSET_DECLARE(used_push_consts, MAX_PUSH_CONST_FAUS);
uint32_t sysval_count;
uint32_t push_count;
uint32_t pilot_count;
uint32_t total_count;
};
@ -416,6 +418,7 @@ struct panvk_shader_variant {
#endif
} spds;
};
struct panvk_priv_mem pilot_spd;
#endif
const char *nir_str;

View file

@ -17,18 +17,12 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
switch (shader->info.stage) {
case MESA_SHADER_COMPUTE:
if (!compute_state_dirty(cmdbuf, PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.compute.push_uniforms;
break;
case MESA_SHADER_VERTEX:
if (!gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.vs.push_uniforms;
break;
case MESA_SHADER_FRAGMENT:
if (!gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS))
return VK_SUCCESS;
push_ptr = &cmdbuf->state.gfx.fs.push_uniforms;
break;
default:
@ -78,6 +72,9 @@ panvk_per_arch(cmd_prepare_push_uniforms)(
BITSET_FOREACH_SET(w, shader->fau.used_push_consts, MAX_PUSH_CONST_FAUS)
faus[fau++] = push_consts[w];
/* These are populated by the pilot shader */
fau += shader->fau.pilot_count;
for (uint32_t i = 0; i < shader->info.fau_consts_count; i += 2) {
faus[fau++] = (uint64_t)shader->info.fau_consts[i + 1] << 32 |
shader->info.fau_consts[i];
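
Taken with the sysval and push-constant loops just above, this fixes the layout of the FAU buffer; a sketch of the resulting order (one 8-byte FAU word per slot, the word size being implied by the 32-FAU/256-byte reservation elsewhere in this MR):

/* FAU buffer layout as packed here (sketch):
 *
 *   [0 .. sysval_count)        remapped sysvals
 *   [.. + push_count)          API push constants
 *   [.. + pilot_count)         gap skipped here; filled at runtime by the
 *                              pilot shader's lowered store_preamble writes
 *   [.. + fau_consts_count/2)  compiler-promoted inline constants
 */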

View file

@ -569,7 +569,7 @@ load_resource_deref_desc(nir_builder *b, nir_deref_instr *deref,
b, num_components, bit_size,
nir_imm_int(b, pan_res_handle(VALHALL_RESOURCE_TABLE_IDX, set + 1)),
set_offset, .range = ~0u, .align_mul = PANVK_DESCRIPTOR_SIZE,
.align_offset = desc_offset);
.align_offset = desc_offset, .access = ACCESS_CAN_SPECULATE);
#endif
}
@ -1162,6 +1162,9 @@ static bool
collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct lower_desc_ctx *ctx)
{
/* We'll also set can_speculate here. */
tex->can_speculate = true;
bool recorded = false;
uint32_t plane = 0;
int sampler_src_idx =
@ -1184,6 +1187,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct panvk_subdesc_info subdesc =
get_sampler_subdesc_info(binding_layout->type, plane);
if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
tex->can_speculate = false;
record_binding(ctx, set, binding, subdesc, max_idx);
recorded = true;
}
@ -1203,6 +1209,9 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
struct panvk_subdesc_info subdesc =
get_tex_subdesc_info(binding_layout->type, plane);
if (binding_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
tex->can_speculate = false;
record_binding(ctx, set, binding, subdesc, max_idx);
recorded = true;
}
@ -1210,11 +1219,66 @@ collect_tex_desc_access(nir_builder *b, nir_tex_instr *tex,
return recorded;
}
static bool
mark_load_can_speculate(nir_intrinsic_instr *load,
const struct lower_desc_ctx *ctx)
{
assert(load->intrinsic == nir_intrinsic_load_deref);
/* If it's already marked CAN_SPECULATE, there's nothing to do */
enum gl_access_qualifier access = nir_intrinsic_access(load);
if (access & ACCESS_CAN_SPECULATE)
return false;
nir_deref_instr *deref = nir_src_as_deref(load->src[0]);
if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo))
return false;
if (nir_deref_mode_is(deref, nir_var_mem_ssbo) &&
((access & ACCESS_VOLATILE) || !(access & ACCESS_CAN_REORDER)))
return false;
while (true) {
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent == NULL)
break;
deref = parent;
}
assert(deref->deref_type == nir_deref_type_cast);
nir_intrinsic_instr *desc_load = nir_src_as_intrinsic(deref->parent);
if (desc_load == NULL ||
desc_load->intrinsic != nir_intrinsic_load_vulkan_descriptor)
return false;
nir_intrinsic_instr *desc_index = nir_src_as_intrinsic(desc_load->src[0]);
if (desc_index == NULL ||
desc_index->intrinsic != nir_intrinsic_vulkan_resource_index)
return false;
unsigned set = nir_intrinsic_desc_set(desc_index);
unsigned binding = nir_intrinsic_binding(desc_index);
const struct panvk_descriptor_set_layout *set_layout =
get_set_layout(set, ctx);
const struct panvk_descriptor_set_binding_layout *bind_layout =
&set_layout->bindings[binding];
if (bind_layout->flags & VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT)
return false;
access |= ACCESS_CAN_SPECULATE;
nir_intrinsic_set_access(load, access);
return true;
}
static bool
collect_intr_desc_access(nir_builder *b, nir_intrinsic_instr *intrin,
struct lower_desc_ctx *ctx)
{
switch (intrin->intrinsic) {
case nir_intrinsic_load_deref:
return mark_load_can_speculate(intrin, ctx);
case nir_intrinsic_vulkan_resource_index: {
unsigned set, binding;
@ -1338,6 +1402,9 @@ panvk_per_arch(nir_lower_descriptors)(
for (uint32_t i = 0; i < set_layout_count; i++)
ctx.set_layouts[i] = to_panvk_descriptor_set_layout(set_layouts[i]);
/* Collect descriptor access. This also marks texture and UBO/SSBO loads
* as can_speculate based on descriptor binding flags.
*/
NIR_PASS(progress, nir, nir_shader_instructions_pass,
collect_instr_desc_access, nir_metadata_all, &ctx);
if (!progress)

View file

@ -65,6 +65,12 @@
#include "vk_shader.h"
#include "vk_util.h"
static inline bool
panvk_wants_pilot_shader(mesa_shader_stage stage, unsigned arch)
{
return arch >= 9 && stage == MESA_SHADER_FRAGMENT;
}
#define FAU_WORD_COUNT 64
struct panvk_lower_sysvals_context {
@ -700,6 +706,9 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
shader_use_sysval(shader, graphics, blend.constants);
}
if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH))
shader_use_sysval(shader, common, push_uniforms);
progress = false;
NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
nir_metadata_all, shader);
@ -717,8 +726,8 @@ lower_load_push_consts(nir_shader *nir, struct panvk_shader_variant *shader)
/* 32 FAUs (256 bytes) are reserved for API push constants */
assert(shader->fau.sysval_count <= FAU_WORD_COUNT - 32 &&
"too many sysval FAUs");
shader->fau.total_count =
shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
shader->fau.push_count = BITSET_COUNT(shader->fau.used_push_consts);
shader->fau.total_count = shader->fau.sysval_count + shader->fau.push_count;
assert(shader->fau.total_count <= FAU_WORD_COUNT &&
"asking for more FAUs than the hardware has to offer");
@ -925,6 +934,26 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
lower_load_push_consts(nir, shader);
if (panvk_wants_pilot_shader(nir->info.stage, PAN_ARCH)) {
/* Our pilots will be more efficient if we optimize first. */
pan_optimize_nir(nir, input.gpu_id);
struct pan_nir_opt_pilot_params pilot = {
.gpu_id = input.gpu_id,
.fau_addr_offset_B = shader_remapped_sysval_offset(
shader, sysval_offset(common, push_uniforms)),
.fau_pilot_data_offset_B =
FAU_WORD_SIZE * shader->fau.total_count,
.fau_pilot_data_space_B =
FAU_WORD_SIZE * (FAU_WORD_COUNT - shader->fau.total_count),
};
pan_nir_opt_pilot(nir, &pilot);
shader->fau.pilot_count =
DIV_ROUND_UP(pilot.pilot_data_size_B, FAU_WORD_SIZE);
shader->fau.total_count += shader->fau.pilot_count;
}
/* Allow the remaining FAU space to be filled with constants. */
input.fau_consts.max_amount =
2 * (FAU_WORD_COUNT - shader->fau.total_count);
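
As a worked example of this accounting (hypothetical counts; 8-byte FAU words as implied by the 32-FAU/256-byte reservation above): with 10 sysval words and 4 push-constant words, fau.total_count is 14, so pilot data starts at byte 112 with (64 - 14) * 8 = 400 bytes of space. If the pilot pass then reports pilot_data_size_B = 36, pilot_count = DIV_ROUND_UP(36, 8) = 5, total_count grows to 19, and fau_consts.max_amount still allows 2 * (64 - 19) = 90 promoted 32-bit constants in the remaining words.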
@ -1208,6 +1237,23 @@ panvk_shader_upload(struct panvk_device *dev,
}
#endif
}
if (shader->info.pilot.enable) {
shader->pilot_spd =
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
if (!panvk_priv_mem_check_alloc(shader->pilot_spd))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
panvk_priv_mem_write_desc(shader->pilot_spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_COMPUTE;
cfg.register_allocation =
pan_register_allocation(shader->info.pilot.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
shader->info.pilot.offset;
cfg.preload.r48_r63 = (shader->info.pilot.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
}
#endif
return VK_SUCCESS;
@ -1237,6 +1283,7 @@ panvk_shader_variant_destroy(struct panvk_shader_variant *shader)
panvk_pool_free_mem(&shader->spds.pos_triangles);
#endif
}
panvk_pool_free_mem(&shader->pilot_spd);
#endif
if (shader->own_bin)