panvk: Add a CSF backend for panvk_queue/cmd_buffer

With those two components implemented, we can now compile all common
per-arch source files.

Co-developed-by: Rebecca Mckeever <rebecca.mckeever@collabora.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Rebecca Mckeever <rebecca.mckeever@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30969>
Boris Brezillon authored 2024-08-23 17:44:59 +02:00, committed by Marge Bot
parent 7049d31676
commit 5544d39f44
11 changed files with 4406 additions and 28 deletions


@@ -10,5 +10,10 @@ ForEachMacros: [
'cs_case',
'cs_default',
'cs_match',
'cs_update_compute_ctx',
'cs_update_frag_ctx',
'cs_update_progress_seqno',
'cs_update_vt_ctx',
'cs_while',
'panvk_cs_reg_upd_ctx',
]


@@ -0,0 +1,460 @@
/*
* Copyright © 2024 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_CMD_BUFFER_H
#define PANVK_CMD_BUFFER_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include <stdint.h>
#include "genxml/cs_builder.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_queue.h"
#include "vk_command_buffer.h"
#include "util/list.h"
#define MAX_VBS 16
#define MAX_RTS 8
struct panvk_cs_sync32 {
uint32_t seqno;
uint32_t error;
};
struct panvk_cs_sync64 {
uint64_t seqno;
uint32_t error;
uint32_t pad;
};
struct panvk_cs_desc_ringbuf {
uint64_t syncobj;
uint64_t ptr;
uint32_t pos;
uint32_t pad;
};
/* 512k of render descriptors that can be used when
* VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT is set on the command buffer. */
#define RENDER_DESC_RINGBUF_SIZE (512 * 1024)
struct panvk_cs_subqueue_context {
uint64_t syncobjs;
uint32_t iter_sb;
uint32_t pad;
struct {
struct panvk_cs_desc_ringbuf desc_ringbuf;
uint64_t tiler_heap;
uint64_t geom_buf;
} render;
uint64_t debug_syncobjs;
} __attribute__((aligned(64)));
struct panvk_cache_flush_info {
enum mali_cs_flush_mode l2;
enum mali_cs_flush_mode lsc;
bool others;
};
struct panvk_cs_deps {
bool needs_draw_flush;
struct {
uint32_t wait_sb_mask;
struct panvk_cache_flush_info cache_flush;
} src[PANVK_SUBQUEUE_COUNT];
struct {
uint32_t wait_subqueue_mask;
} dst[PANVK_SUBQUEUE_COUNT];
};
enum panvk_sb_ids {
PANVK_SB_LS = 0,
PANVK_SB_IMM_FLUSH = 0,
PANVK_SB_DEFERRED_SYNC = 1,
PANVK_SB_DEFERRED_FLUSH = 2,
PANVK_SB_ITER_START = 3,
PANVK_SB_ITER_COUNT = 5,
};
#define SB_IMM_MASK 0
#define SB_MASK(nm) BITFIELD_BIT(PANVK_SB_##nm)
#define SB_ID(nm) PANVK_SB_##nm
#define SB_ITER(x) (PANVK_SB_ITER_START + (x))
#define SB_WAIT_ITER(x) BITFIELD_BIT(PANVK_SB_ITER_START + (x))
#define SB_ALL_MASK BITFIELD_MASK(8)
static inline uint32_t
next_iter_sb(uint32_t sb)
{
return sb + 1 < PANVK_SB_ITER_COUNT ? sb + 1 : 0;
}
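/* Illustration (derived from the definitions above, not part of the patch):
 * the iterator index wraps within [0, PANVK_SB_ITER_COUNT), so successive
 * calls walk scoreboard slots SB_ITER(0)..SB_ITER(4) (absolute slots 3..7)
 * and then start over:
 *
 *   uint32_t sb = 0;
 *   for (unsigned i = 0; i < 6; i++)
 *      sb = next_iter_sb(sb);   // 1, 2, 3, 4, 0, 1
 *
 * Slots 0..2 stay reserved for load/store, immediate flushes and deferred
 * sync/flush operations, which is why SB_ALL_MASK covers 8 slots.
 */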
enum panvk_cs_regs {
/* RUN_IDVS staging regs. */
PANVK_CS_REG_RUN_IDVS_SR_START = 0,
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
/* RUN_FRAGMENT staging regs. */
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 40,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
/* RUN_COMPUTE staging regs. */
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
PANVK_CS_REG_RUN_COMPUTE_SR_END = 39,
/* Range of registers that can be used to store temporary data on
* all queues. Note that some queues have extra space they can use
* as scratch space. */
PANVK_CS_REG_SCRATCH_START = 66,
PANVK_CS_REG_SCRATCH_END = 83,
/* Driver context. */
PANVK_CS_REG_PROGRESS_SEQNO_START = 84,
PANVK_CS_REG_PROGRESS_SEQNO_END = 89,
PANVK_CS_REG_SUBQUEUE_CTX_START = 90,
PANVK_CS_REG_SUBQUEUE_CTX_END = 91,
};
static inline struct cs_index
cs_scratch_reg_tuple(struct cs_builder *b, unsigned start, unsigned count)
{
assert(PANVK_CS_REG_SCRATCH_START + start + count - 1 <=
PANVK_CS_REG_SCRATCH_END);
return cs_reg_tuple(b, PANVK_CS_REG_SCRATCH_START + start, count);
}
static inline struct cs_index
cs_scratch_reg32(struct cs_builder *b, unsigned reg)
{
return cs_scratch_reg_tuple(b, reg, 1);
}
static inline struct cs_index
cs_scratch_reg64(struct cs_builder *b, unsigned reg)
{
assert(reg % 2 == 0);
return cs_scratch_reg_tuple(b, reg, 2);
}
static inline struct cs_index
cs_sr_reg_tuple(struct cs_builder *b, unsigned start, unsigned count)
{
assert(start + count - 1 < PANVK_CS_REG_SCRATCH_START);
return cs_reg_tuple(b, start, count);
}
static inline struct cs_index
cs_sr_reg32(struct cs_builder *b, unsigned reg)
{
return cs_sr_reg_tuple(b, reg, 1);
}
static inline struct cs_index
cs_sr_reg64(struct cs_builder *b, unsigned reg)
{
assert(reg % 2 == 0);
return cs_sr_reg_tuple(b, reg, 2);
}
static inline struct cs_index
cs_subqueue_ctx_reg(struct cs_builder *b)
{
return cs_reg64(b, PANVK_CS_REG_SUBQUEUE_CTX_START);
}
static inline struct cs_index
cs_progress_seqno_reg(struct cs_builder *b, enum panvk_subqueue_id subqueue)
{
assert(PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2) <
PANVK_CS_REG_PROGRESS_SEQNO_END);
return cs_reg64(b, PANVK_CS_REG_PROGRESS_SEQNO_START + (subqueue * 2));
}
struct panvk_cs_reg_upd_context {
reg_perm_cb_t reg_perm;
struct panvk_cs_reg_upd_context *next;
};
struct panvk_cs_state {
struct cs_builder builder;
struct cs_load_store_tracker ls_tracker;
/* Used to debug register writes in invalid contexts. */
struct {
struct panvk_cs_reg_upd_context *upd_ctx_stack;
reg_perm_cb_t base_perm;
} reg_access;
/* Sync point relative to the beginning of the command buffer.
* Needs to be offset by the subqueue sync point. */
int32_t relative_sync_point;
};
static inline struct panvk_cs_reg_upd_context *
panvk_cs_reg_ctx_push(struct cs_builder *b,
struct panvk_cs_reg_upd_context *ctx,
reg_perm_cb_t reg_perm)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
ctx->reg_perm = reg_perm;
ctx->next = cs_state->reg_access.upd_ctx_stack;
cs_state->reg_access.upd_ctx_stack = ctx;
return ctx;
}
static inline void
panvk_cs_reg_ctx_pop(struct cs_builder *b, struct panvk_cs_reg_upd_context *ctx)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
assert(cs_state->reg_access.upd_ctx_stack == ctx);
cs_state->reg_access.upd_ctx_stack = ctx->next;
}
struct panvk_cs_reg_range {
unsigned start;
unsigned end;
};
#define PANVK_CS_REG_RANGE(__name) \
{ \
.start = PANVK_CS_REG_##__name##_START, \
.end = PANVK_CS_REG_##__name##_END, \
}
#define panvk_cs_reg_blacklist(__name, ...) \
static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \
struct cs_builder *b, unsigned reg) \
{ \
const struct panvk_cs_reg_range ranges[] = { \
__VA_ARGS__, \
}; \
\
for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \
if (reg >= ranges[i].start && reg <= ranges[i].end) \
return CS_REG_RD; \
} \
\
return CS_REG_RW; \
}
panvk_cs_reg_blacklist(vt, PANVK_CS_REG_RANGE(RUN_IDVS_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
panvk_cs_reg_blacklist(frag, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
panvk_cs_reg_blacklist(compute, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR),
PANVK_CS_REG_RANGE(PROGRESS_SEQNO),
PANVK_CS_REG_RANGE(SUBQUEUE_CTX));
#define panvk_cs_reg_whitelist(__name, ...) \
static inline enum cs_reg_perm panvk_cs_##__name##_reg_perm( \
struct cs_builder *b, unsigned reg) \
{ \
const struct panvk_cs_reg_range ranges[] = { \
__VA_ARGS__, \
}; \
\
for (unsigned i = 0; i < ARRAY_SIZE(ranges); i++) { \
if (reg >= ranges[i].start && reg <= ranges[i].end) \
return CS_REG_RW; \
} \
\
return CS_REG_RD; \
}
#define panvk_cs_reg_upd_ctx(__b, __name) \
for (struct panvk_cs_reg_upd_context __reg_upd_ctx, \
*reg_upd_ctxp = panvk_cs_reg_ctx_push(__b, &__reg_upd_ctx, \
panvk_cs_##__name##_reg_perm); \
reg_upd_ctxp; \
panvk_cs_reg_ctx_pop(__b, &__reg_upd_ctx), reg_upd_ctxp = NULL)
panvk_cs_reg_whitelist(progress_seqno, PANVK_CS_REG_RANGE(PROGRESS_SEQNO));
#define cs_update_progress_seqno(__b) panvk_cs_reg_upd_ctx(__b, progress_seqno)
panvk_cs_reg_whitelist(compute_ctx, PANVK_CS_REG_RANGE(RUN_COMPUTE_SR));
#define cs_update_compute_ctx(__b) panvk_cs_reg_upd_ctx(__b, compute_ctx)
panvk_cs_reg_whitelist(frag_ctx, PANVK_CS_REG_RANGE(RUN_FRAGMENT_SR));
#define cs_update_frag_ctx(__b) panvk_cs_reg_upd_ctx(__b, frag_ctx)
panvk_cs_reg_whitelist(vt_ctx, PANVK_CS_REG_RANGE(RUN_IDVS_SR));
#define cs_update_vt_ctx(__b) panvk_cs_reg_upd_ctx(__b, vt_ctx)
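/* Usage sketch (illustrative, not part of the patch): writes to a
 * blacklisted register range are only accepted inside the matching update
 * context, e.g.
 *
 *   cs_update_vt_ctx(b) {
 *      cs_move64_to(b, cs_sr_reg64(b, 0), res_table);
 *   }
 *
 * Outside such a context, the per-subqueue base permission callback reports
 * those registers as read-only, which the debug register-access checker in
 * cs_builder can then flag.
 */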
struct panvk_tls_state {
struct panfrost_ptr desc;
struct pan_tls_info info;
unsigned max_wg_count;
};
struct panvk_cmd_compute_state {
struct panvk_descriptor_state desc_state;
const struct panvk_shader *shader;
struct panvk_compute_sysvals sysvals;
mali_ptr push_uniforms;
struct {
struct panvk_shader_desc_state desc;
} cs;
};
struct panvk_attrib_buf {
mali_ptr address;
unsigned size;
};
struct panvk_resolve_attachment {
VkResolveModeFlagBits mode;
struct panvk_image_view *src_iview;
struct panvk_image_view *dst_iview;
};
struct panvk_cmd_graphics_state {
struct panvk_descriptor_state desc_state;
struct {
struct vk_vertex_input_state vi;
struct vk_sample_locations_state sl;
} dynamic;
struct panvk_graphics_sysvals sysvals;
struct panvk_shader_link link;
bool linked;
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
mali_ptr spd;
} fs;
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
struct {
mali_ptr pos, var;
} spds;
} vs;
struct {
struct panvk_attrib_buf bufs[MAX_VBS];
unsigned count;
bool dirty;
} vb;
/* Index buffer */
struct {
struct panvk_buffer *buffer;
uint64_t offset;
uint8_t index_size;
uint32_t first_vertex, base_vertex, base_instance;
bool dirty;
} ib;
struct {
struct panvk_blend_info info;
} cb;
struct {
VkRenderingFlags flags;
uint32_t layer_count;
enum vk_rp_attachment_flags bound_attachments;
struct {
VkFormat fmts[MAX_RTS];
uint8_t samples[MAX_RTS];
struct panvk_resolve_attachment resolve[MAX_RTS];
} color_attachments;
struct pan_image_view zs_pview;
struct {
struct panvk_resolve_attachment resolve;
} z_attachment, s_attachment;
struct {
struct pan_fb_info info;
bool crc_valid[MAX_RTS];
} fb;
struct panfrost_ptr fbds;
mali_ptr tiler;
bool dirty;
} render;
mali_ptr push_uniforms;
};
struct panvk_cmd_buffer {
struct vk_command_buffer vk;
VkCommandBufferUsageFlags flags;
struct panvk_pool cs_pool;
struct panvk_pool desc_pool;
struct panvk_pool tls_pool;
struct list_head push_sets;
uint32_t flush_id;
struct {
struct panvk_cmd_graphics_state gfx;
struct panvk_cmd_compute_state compute;
struct panvk_push_constant_state push_constants;
struct panvk_cs_state cs[PANVK_SUBQUEUE_COUNT];
struct panvk_tls_state tls;
} state;
};
VK_DEFINE_HANDLE_CASTS(panvk_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
static inline struct cs_builder *
panvk_get_cs_builder(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
return &cmdbuf->state.cs[subqueue].builder;
}
static inline struct panvk_descriptor_state *
panvk_cmd_get_desc_state(struct panvk_cmd_buffer *cmdbuf,
VkPipelineBindPoint bindpoint)
{
switch (bindpoint) {
case VK_PIPELINE_BIND_POINT_GRAPHICS:
return &cmdbuf->state.gfx.desc_state;
case VK_PIPELINE_BIND_POINT_COMPUTE:
return &cmdbuf->state.compute.desc_state;
default:
assert(!"Unsupported bind point");
return NULL;
}
}
extern const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops);
void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf);
void panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
enum panvk_subqueue_id subqueue);
void panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
const VkDependencyInfo *in,
struct panvk_cs_deps *out);
#endif /* PANVK_CMD_BUFFER_H */


@@ -0,0 +1,72 @@
/*
* Copyright © 2021 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef PANVK_QUEUE_H
#define PANVK_QUEUE_H
#ifndef PAN_ARCH
#error "PAN_ARCH must be defined"
#endif
#include "genxml/gen_macros.h"
#include <stdint.h>
#include "panvk_device.h"
#include "vk_queue.h"
enum panvk_subqueue_id {
PANVK_SUBQUEUE_VERTEX_TILER = 0,
PANVK_SUBQUEUE_FRAGMENT,
PANVK_SUBQUEUE_COMPUTE,
PANVK_SUBQUEUE_COUNT,
};
struct panvk_tiler_heap {
uint32_t chunk_size;
struct panvk_priv_mem desc;
struct {
uint32_t handle;
mali_ptr dev_addr;
} context;
};
struct panvk_subqueue {
struct panvk_priv_mem context;
};
struct panvk_desc_ringbuf {
struct panvk_priv_mem syncobj;
struct pan_kmod_bo *bo;
struct {
uint64_t dev;
void *host;
} addr;
};
struct panvk_queue {
struct vk_queue vk;
uint32_t group_handle;
uint32_t syncobj_handle;
struct panvk_tiler_heap tiler_heap;
struct panvk_desc_ringbuf render_desc_ringbuf;
struct panvk_priv_mem syncobjs;
struct panvk_priv_mem debug_syncobjs;
struct panvk_subqueue subqueues[PANVK_SUBQUEUE_COUNT];
};
VK_DEFINE_HANDLE_CASTS(panvk_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
void panvk_per_arch(queue_finish)(struct panvk_queue *queue);
VkResult panvk_per_arch(queue_init)(struct panvk_device *device,
struct panvk_queue *queue, int idx,
const VkDeviceQueueCreateInfo *create_info);
#endif


@@ -0,0 +1,703 @@
/*
* Copyright © 2021 Collabora Ltd.
*
* Derived from tu_cmd_buffer.c which is:
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "genxml/gen_macros.h"
#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"
static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
unsigned core_id_range;
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
if (cmdbuf->state.tls.info.tls.size) {
unsigned thread_tls_alloc =
panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
unsigned size = panfrost_get_total_stack_size(
cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);
cmdbuf->state.tls.info.tls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
}
assert(!cmdbuf->state.tls.info.wls.size);
if (cmdbuf->state.tls.desc.cpu) {
GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
}
}
static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
cs_update_progress_seqno(b) {
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
uint32_t rel_sync_point = cmdbuf->state.cs[i].relative_sync_point;
if (!rel_sync_point)
continue;
cs_add64(b, cs_progress_seqno_reg(b, i), cs_progress_seqno_reg(b, i),
rel_sync_point);
}
}
/* If we're decoding the CS or dumping memory mappings, we need a flush
* to make sure all data have been pushed to memory. */
if (instance->debug_flags & (PANVK_DEBUG_DUMP | PANVK_DEBUG_TRACE)) {
struct cs_index flush_id = cs_scratch_reg32(b, 0);
cs_move32_to(b, flush_id, 0);
cs_wait_slots(b, SB_ALL_MASK, false);
cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}
/* If we're in sync/trace mode, we signal the debug object. */
if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
struct cs_index one = cs_scratch_reg32(b, 2);
struct cs_index error = cs_scratch_reg32(b, 3);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);
cs_move32_to(b, one, 1);
cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, debug_sync_addr, debug_sync_addr,
sizeof(struct panvk_cs_sync32) * subqueue);
cs_load32_to(b, error, debug_sync_addr,
offsetof(struct panvk_cs_sync32, error));
cs_wait_slots(b, SB_ALL_MASK, false);
cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_SYSTEM, one, debug_sync_addr,
cs_now());
cs_match(b, error, cmp_scratch) {
cs_case(b, 0) {
/* Do nothing. */
}
cs_default(b) {
/* Overwrite the sync error with the first error we encountered. */
cs_store32(b, error, debug_sync_addr,
offsetof(struct panvk_cs_sync32, error));
cs_wait_slot(b, SB_ID(LS), false);
}
}
}
cs_finish(&cmdbuf->state.cs[subqueue].builder);
}
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
emit_tls(cmdbuf);
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
struct cs_builder *b = &cmdbuf->state.cs[i].builder;
if (!cs_is_valid(b)) {
vk_command_buffer_set_error(&cmdbuf->vk,
VK_ERROR_OUT_OF_DEVICE_MEMORY);
} else {
finish_cs(cmdbuf, i);
}
}
return vk_command_buffer_end(&cmdbuf->vk);
}
static bool
src_stages_need_draw_flush(VkPipelineStageFlags2 stages)
{
static const VkPipelineStageFlags2 draw_flush_stage_mask =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
return (stages & draw_flush_stage_mask) != 0;
}
static bool
stages_cover_subqueue(enum panvk_subqueue_id subqueue,
VkPipelineStageFlags2 stages)
{
static const VkPipelineStageFlags2 queue_coverage[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT,
[PANVK_SUBQUEUE_COMPUTE] =
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_COPY_BIT,
};
return (stages & queue_coverage[subqueue]) != 0;
}
static uint32_t
src_stages_to_subqueue_sb_mask(enum panvk_subqueue_id subqueue,
VkPipelineStageFlags2 stages)
{
if (!stages_cover_subqueue(subqueue, stages))
return 0;
/* Indirect draw buffers are read from the command stream, and load/store
* operations are synchronized with the LS scoreboard immediately after the
* read, so no need to wait in that case.
*/
if (subqueue == PANVK_SUBQUEUE_VERTEX_TILER &&
stages == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
return 0;
/* We need to wait for all previously submitted jobs, and given the
* iterator scoreboard is a moving target, we just wait for the
* whole dynamic scoreboard range. */
return BITFIELD_RANGE(PANVK_SB_ITER_START, PANVK_SB_ITER_COUNT);
}
static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
struct panvk_cache_flush_info *cache_flush,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
static const VkAccessFlags2 dev_writes[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
[PANVK_SUBQUEUE_COMPUTE] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
VK_ACCESS_2_SHADER_WRITE_BIT |
VK_ACCESS_2_TRANSFER_WRITE_BIT,
};
static const VkAccessFlags2 dev_reads[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] =
VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_INDEX_READ_BIT |
VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT |
VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_TRANSFER_READ_BIT |
VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
[PANVK_SUBQUEUE_FRAGMENT] =
VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
[PANVK_SUBQUEUE_COMPUTE] =
VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
};
/* Note on the cache organization:
* - L2 cache is unified, so all changes to this cache are automatically
* visible to all GPU sub-components (shader cores, tiler, ...). This
* means we only need to flush when the host (AKA CPU) is involved.
* - LS caches (which are basically just read-write L1 caches) are coherent
* with each other and with the L2 cache, so again, we only need to flush
* when the host is involved.
* - Other read-only L1 caches (like the ones in front of the texture unit)
* are not coherent with the LS or L2 caches, and thus need to be
* invalidated any time a write happens.
*/
#define ACCESS_HITS_RO_L1_CACHE \
(VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | \
VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | \
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | \
VK_ACCESS_2_TRANSFER_READ_BIT)
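/* Worked example (illustrative): a shader storage write whose result is
 * later read through the texture unit only needs cache_flush->others set
 * (invalidate the read-only L1 caches); l2/lsc can stay at
 * MALI_CS_FLUSH_MODE_NONE since those caches are coherent. Only host
 * writes/reads force the clean/invalidate paths below.
 */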
if ((dev_writes[subqueue] & src_access) &&
(dev_reads[subqueue] & ACCESS_HITS_RO_L1_CACHE & dst_access))
cache_flush->others |= true;
/* If the host wrote something, we need to clean/invalidate everything. */
if ((src_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
(src_access & VK_ACCESS_2_HOST_WRITE_BIT) &&
((dev_reads[subqueue] | dev_writes[subqueue]) & dst_access)) {
cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
cache_flush->others |= true;
}
/* If the host needs to read something we wrote, we need to clean
* everything. */
if ((dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
(dst_access & VK_ACCESS_2_HOST_READ_BIT) &&
(dev_writes[subqueue] & src_access)) {
cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
}
}
static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages, VkAccessFlags src_access,
VkAccessFlags dst_access, struct panvk_cs_deps *deps)
{
if (src_stages_need_draw_flush(src_stages) && cmdbuf->state.gfx.render.tiler)
deps->needs_draw_flush = true;
uint32_t wait_subqueue_mask = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
uint32_t sb_mask = src_stages_to_subqueue_sb_mask(i, src_stages);
assert((sb_mask != 0) == stages_cover_subqueue(i, src_stages));
if (!sb_mask)
continue;
deps->src[i].wait_sb_mask |= sb_mask;
collect_cache_flush_info(i, &deps->src[i].cache_flush, src_stages,
dst_stages, src_access, dst_access);
wait_subqueue_mask |= BITFIELD_BIT(i);
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!stages_cover_subqueue(i, dst_stages))
continue;
deps->dst[i].wait_subqueue_mask |= wait_subqueue_mask & ~BITFIELD_BIT(i);
}
}
void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
const VkDependencyInfo *in,
struct panvk_cs_deps *out)
{
memset(out, 0, sizeof(*out));
for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
VkPipelineStageFlags2 src_stages =
vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
VkPipelineStageFlags2 dst_stages =
vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
VkAccessFlags2 src_access =
vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
VkAccessFlags2 dst_access =
vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);
collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
out);
}
/* The draw flush will add a vertex -> fragment dependency, so we can skip
* the one described in the deps. */
if (out->needs_draw_flush)
out->dst[PANVK_SUBQUEUE_FRAGMENT].wait_subqueue_mask &=
~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
const VkDependencyInfo *pDependencyInfo)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_cs_deps deps;
panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);
if (deps.needs_draw_flush)
panvk_per_arch(cmd_flush_draws)(cmdbuf);
uint32_t wait_subqueue_mask = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!deps.src[i].wait_sb_mask)
continue;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];
cs_wait_slots(b, deps.src[i].wait_sb_mask, false);
struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
struct cs_index flush_id = cs_scratch_reg32(b, 0);
cs_move32_to(b, flush_id, 0);
cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}
/* If no one waits on us, there's no point signaling the sync object. */
if (wait_subqueue_mask & BITFIELD_BIT(i)) {
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index add_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
cs_move64_to(b, add_val, 1);
cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
cs_now());
++cs_state->relative_sync_point;
}
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!deps.dst[i].wait_subqueue_mask)
continue;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
if (!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(j)))
continue;
struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index wait_val = cs_scratch_reg64(b, 2);
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);
cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
cs_state->relative_sync_point);
cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
sync_addr);
}
}
}
void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
enum panvk_subqueue_id subqueue)
{
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
struct cs_index iter_sb = cs_scratch_reg32(b, 0);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);
cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS), false);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
cs_case(b, x) { \
cs_wait_slot(b, SB_ITER(x), false); \
cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS)); \
}
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
#undef CASE
}
}
static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
struct panvk_cmd_buffer *cmdbuf = cookie;
const unsigned capacity = 64 * 1024 / sizeof(uint64_t);
struct panfrost_ptr ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);
return (struct cs_buffer){
.cpu = ptr.cpu,
.gpu = ptr.gpu,
.capacity = capacity,
};
}
static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
struct panvk_cs_state *cs_state =
container_of(b, struct panvk_cs_state, builder);
struct panvk_cs_reg_upd_context *upd_ctx;
for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
upd_ctx = upd_ctx->next) {
if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
return CS_REG_RW;
}
return cs_state->reg_access.base_perm(b, reg);
}
static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
[PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
[PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
[PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
};
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
/* Lazy allocation of the root CS. */
struct cs_buffer root_cs = {0};
struct cs_builder_conf conf = {
.nr_registers = 96,
.nr_kernel_registers = 4,
.alloc_buffer = alloc_cs_buffer,
.cookie = cmdbuf,
};
if (instance->debug_flags & PANVK_DEBUG_CS) {
cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
.sb_slot = SB_ID(LS),
};
conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;
cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
conf.reg_perm = cs_reg_perm;
}
cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
}
}
static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
VkCommandBufferResetFlags flags)
{
struct panvk_cmd_buffer *cmdbuf =
container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
struct panvk_cmd_pool *pool =
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
vk_command_buffer_reset(&cmdbuf->vk);
panvk_pool_reset(&cmdbuf->cs_pool);
panvk_pool_reset(&cmdbuf->desc_pool);
panvk_pool_reset(&cmdbuf->tls_pool);
list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
list_inithead(&cmdbuf->push_sets);
memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
init_cs_builders(cmdbuf);
}
static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
struct panvk_cmd_buffer *cmdbuf =
container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
struct panvk_cmd_pool *pool =
container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
panvk_pool_cleanup(&cmdbuf->cs_pool);
panvk_pool_cleanup(&cmdbuf->desc_pool);
panvk_pool_cleanup(&cmdbuf->tls_pool);
list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
vk_command_buffer_finish(&cmdbuf->vk);
vk_free(&dev->vk.alloc, cmdbuf);
}
static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
struct vk_command_buffer **cmdbuf_out)
{
struct panvk_device *device =
container_of(vk_pool->base.device, struct panvk_device, vk);
struct panvk_cmd_pool *pool =
container_of(vk_pool, struct panvk_cmd_pool, vk);
struct panvk_cmd_buffer *cmdbuf;
cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!cmdbuf)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
VkResult result = vk_command_buffer_init(
&pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
if (result != VK_SUCCESS) {
vk_free(&device->vk.alloc, cmdbuf);
return result;
}
list_inithead(&cmdbuf->push_sets);
cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
&cmdbuf->state.gfx.dynamic.sl;
struct panvk_pool_properties cs_pool_props = {
.create_flags = 0,
.slab_size = 64 * 1024,
.label = "Command buffer CS pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);
struct panvk_pool_properties desc_pool_props = {
.create_flags = 0,
.slab_size = 64 * 1024,
.label = "Command buffer descriptor pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
&desc_pool_props);
struct panvk_pool_properties tls_pool_props = {
.create_flags =
panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
.slab_size = 64 * 1024,
.label = "TLS pool",
.prealloc = false,
.owns_bos = true,
.needs_locking = false,
};
panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
&tls_pool_props);
init_cs_builders(cmdbuf);
*cmdbuf_out = &cmdbuf->vk;
return VK_SUCCESS;
}
const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
.create = panvk_create_cmdbuf,
.reset = panvk_reset_cmdbuf,
.destroy = panvk_destroy_cmdbuf,
};
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
cmdbuf->flags = pBeginInfo->flags;
/* The descriptor ringbuf trips out pandecode because we always point to the
* next tiler/framebuffer descriptor after CS execution, which means we're
* decoding an uninitialized or stale descriptor.
* FIXME: find a way to trace the simultaneous path that doesn't crash. One
* option would be to disable CS interpretation and dump the RUN_xxx context
* on the side at execution time.
*/
if (instance->debug_flags & PANVK_DEBUG_TRACE)
cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
return VK_SUCCESS;
}


@@ -0,0 +1,285 @@
/*
* Copyright © 2024 Collabora Ltd.
*
* Derived from tu_cmd_buffer.c which is:
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
* Copyright © 2015 Intel Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "genxml/gen_macros.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include <vulkan/vulkan_core.h>
static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
if (cs_desc_state->driver_set.dev_addr)
return VK_SUCCESS;
const struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
const struct panvk_shader *cs = cmdbuf->state.compute.shader;
uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
struct panvk_opaque_desc *descs = driver_set.cpu;
if (!driver_set.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
/* Dummy sampler always comes first. */
pan_pack(&descs[0], SAMPLER, _) {
}
panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
(struct mali_buffer_packed *)(&descs[1]));
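/* Resulting driver-set layout: descriptor 0 is the dummy sampler, and
 * descriptors 1..dyn_bufs.count hold the dynamic buffer descriptors, which
 * is why desc_count is dyn_bufs.count + 1. */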
cs_desc_state->driver_set.dev_addr = driver_set.gpu;
cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
return VK_SUCCESS;
}
static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
cmdbuf->state.compute.push_uniforms = panvk_per_arch(
cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.compute.sysvals,
sizeof(cmdbuf->state.compute.sysvals));
return cmdbuf->state.compute.push_uniforms ? VK_SUCCESS
: VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX, uint32_t baseGroupY,
uint32_t baseGroupZ, uint32_t groupCountX,
uint32_t groupCountY, uint32_t groupCountZ)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
VkResult result;
/* If there's no compute shader, we can skip the dispatch. */
if (!panvk_priv_mem_dev_addr(shader->spd))
return;
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!tsd.gpu)
return;
struct pan_tls_info tlsinfo = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
};
unsigned core_id_range;
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
if (tlsinfo.wls.size) {
/* TODO: Clamp WLS instance to some maximum WLS budget. */
struct pan_compute_dim dim = {groupCountX, groupCountY, groupCountZ};
tlsinfo.wls.instances = pan_wls_instances(&dim);
unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
tlsinfo.wls.instances * core_id_range;
tlsinfo.wls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
if (!tlsinfo.wls.ptr)
return;
}
cmdbuf->state.tls.info.tls.size =
MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);
if (!cmdbuf->state.tls.desc.gpu) {
cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!cmdbuf->state.tls.desc.gpu)
return;
}
GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);
result = panvk_per_arch(cmd_prepare_push_descs)(
cmdbuf, desc_state, shader->desc_info.used_set_mask);
if (result != VK_SUCCESS)
return;
struct panvk_compute_sysvals *sysvals = &cmdbuf->state.compute.sysvals;
sysvals->num_work_groups.x = groupCountX;
sysvals->num_work_groups.y = groupCountY;
sysvals->num_work_groups.z = groupCountZ;
sysvals->local_group_size.x = shader->local_size.x;
sysvals->local_group_size.y = shader->local_size.y;
sysvals->local_group_size.z = shader->local_size.z;
result = prepare_driver_set(cmdbuf);
if (result != VK_SUCCESS)
return;
cmdbuf->state.compute.push_uniforms = 0;
result = prepare_push_uniforms(cmdbuf);
if (result != VK_SUCCESS)
return;
result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
shader, cs_desc_state);
if (result != VK_SUCCESS)
return;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
unsigned task_axis = MALI_TASK_AXIS_X;
unsigned task_increment = 0;
/* Copy the global TLS pointer to the per-job TSD. */
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false);
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false);
cs_update_compute_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);
uint32_t push_size = 256 + sizeof(struct panvk_compute_sysvals);
uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
mali_ptr fau_ptr =
cmdbuf->state.compute.push_uniforms | (fau_count << 56);
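/* Note (interpretation of the packing above, not spelled out in the patch):
 * the FAU pointer carries the number of 64-bit fast-access-uniform words in
 * its top byte (fau_count << 56) and the push-uniform buffer address in the
 * lower bits, so a single 64-bit move describes both. */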
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);
/* Global attribute offset */
cs_move32_to(b, cs_sr_reg32(b, 32), 0);
struct mali_compute_size_workgroup_packed wg_size;
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
cfg.workgroup_size_x = shader->local_size.x;
cfg.workgroup_size_y = shader->local_size.y;
cfg.workgroup_size_z = shader->local_size.z;
cfg.allow_merging_workgroups = false;
}
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
cs_move32_to(b, cs_sr_reg32(b, 34), baseGroupX * shader->local_size.x);
cs_move32_to(b, cs_sr_reg32(b, 35), baseGroupY * shader->local_size.y);
cs_move32_to(b, cs_sr_reg32(b, 36), baseGroupZ * shader->local_size.z);
cs_move32_to(b, cs_sr_reg32(b, 37), groupCountX);
cs_move32_to(b, cs_sr_reg32(b, 38), groupCountY);
cs_move32_to(b, cs_sr_reg32(b, 39), groupCountZ);
/* Pick the task_axis and task_increment to maximize thread utilization. */
unsigned threads_per_wg =
shader->local_size.x * shader->local_size.y * shader->local_size.z;
unsigned max_thread_cnt = panfrost_compute_max_thread_count(
&phys_dev->kmod.props, shader->info.work_reg_count);
unsigned threads_per_task = threads_per_wg;
unsigned local_size[3] = {
shader->local_size.x,
shader->local_size.y,
shader->local_size.z,
};
for (unsigned i = 0; i < 3; i++) {
if (threads_per_task * local_size[i] >= max_thread_cnt) {
/* We reached our thread limit, so stop at the current axis and
* calculate the increment so it doesn't exceed the per-core
* thread capacity.
*/
task_increment = max_thread_cnt / threads_per_task;
break;
} else if (task_axis == MALI_TASK_AXIS_Z) {
/* We reached the Z axis, and there's still room to stuff more
* threads. Pick the current axis grid size as our increment
* as there's no point using something bigger.
*/
task_increment = local_size[i];
break;
}
threads_per_task *= local_size[i];
task_axis++;
}
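/* Worked example (numbers are illustrative): with local_size = (8, 8, 1)
 * and max_thread_cnt = 256, the first iteration sees
 * threads_per_task * local_size[0] = 64 * 8 >= 256, so we stop on the X
 * axis with task_increment = 256 / 64 = 4 workgroups per task. */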
}
assert(task_axis <= MALI_TASK_AXIS_Z);
assert(task_increment > 0);
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
cs_req_res(b, CS_COMPUTE_RES);
cs_run_compute(b, task_increment, task_axis, false,
cs_shader_res_sel(0, 0, 0, 0));
cs_req_res(b, 0);
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
struct cs_index add_val = cs_scratch_reg64(b, 4);
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
cs_move64_to(b, add_val, 1);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
cs_case(b, x) { \
cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \
add_val, sync_addr, \
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \
cs_move32_to(b, iter_sb, next_iter_sb(x)); \
}
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
#undef CASE
}
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS), false);
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer, VkDeviceSize offset)
{
panvk_stub();
}

File diff suppressed because it is too large.


@@ -0,0 +1,745 @@
/*
* Copyright © 2024 Collabora Ltd.
*
* SPDX-License-Identifier: MIT
*/
#include "drm-uapi/panthor_drm.h"
#include "genxml/cs_builder.h"
#include "genxml/decode.h"
#include "panvk_cmd_buffer.h"
#include "panvk_macros.h"
#include "panvk_queue.h"
#include "vk_drm_syncobj.h"
#include "vk_log.h"
static void
finish_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
panvk_pool_free_mem(&dev->mempools.rw, ringbuf->syncobj);
if (dev->debug.decode_ctx && ringbuf->addr.dev) {
pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
RENDER_DESC_RINGBUF_SIZE);
pandecode_inject_free(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
RENDER_DESC_RINGBUF_SIZE);
}
if (ringbuf->addr.dev) {
struct pan_kmod_vm_op op = {
.type = PAN_KMOD_VM_OP_TYPE_UNMAP,
.va = {
.start = ringbuf->addr.dev,
.size = RENDER_DESC_RINGBUF_SIZE * 2,
},
};
ASSERTED int ret =
pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
assert(!ret);
}
if (ringbuf->addr.host) {
ASSERTED int ret =
os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
assert(!ret);
}
pan_kmod_bo_put(ringbuf->bo);
}
static VkResult
init_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
const size_t size = RENDER_DESC_RINGBUF_SIZE;
VkResult result;
int ret;
ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
if (!ringbuf->bo)
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a descriptor ring buffer context");
if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
ringbuf->addr.host = pan_kmod_bo_mmap(
ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
if (ringbuf->addr.host == MAP_FAILED) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to CPU map ringbuf BO");
goto err_finish_ringbuf;
}
}
/* We choose the alignment to guarantee that we won't ever cross a 4G
* boundary when accessing the mapping. This way we can encode the wraparound
* using 32-bit operations. */
uint64_t dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);
struct pan_kmod_vm_op vm_ops[] = {
{
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr,
.size = RENDER_DESC_RINGBUF_SIZE,
},
.map = {
.bo = ringbuf->bo,
.bo_offset = 0,
},
},
{
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
.size = RENDER_DESC_RINGBUF_SIZE,
},
.map = {
.bo = ringbuf->bo,
.bo_offset = 0,
},
},
};
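/* Illustration (assumption about how the ring is consumed, not spelled out
 * here): mapping the BO twice back-to-back, with the pair aligned on
 * 2 * RENDER_DESC_RINGBUF_SIZE, means an allocation that starts near the
 * end of the ring can spill into the second mapping and still be virtually
 * contiguous, and the ring position can be advanced with plain 32-bit adds
 * without ever crossing a 4G boundary. */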
ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
ARRAY_SIZE(vm_ops));
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to GPU map ringbuf BO");
goto err_finish_ringbuf;
}
ringbuf->addr.dev = dev_addr;
if (dev->debug.decode_ctx) {
pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
pandecode_inject_mmap(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
}
struct panvk_pool_alloc_info alloc_info = {
.size = sizeof(struct panvk_cs_sync32),
.alignment = 64,
};
ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);
if (!syncobj) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create the render desc ringbuf context");
goto err_finish_ringbuf;
}
*syncobj = (struct panvk_cs_sync32){
.seqno = RENDER_DESC_RINGBUF_SIZE,
};
return VK_SUCCESS;
err_finish_ringbuf:
finish_render_desc_ringbuf(queue);
return result;
}
static VkResult
init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_subqueue *subq = &queue->subqueues[subqueue];
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
struct panvk_pool_alloc_info alloc_info = {
.size = sizeof(struct panvk_cs_subqueue_context),
.alignment = 64,
};
subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(subq->context))
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a queue context");
struct panvk_cs_subqueue_context *cs_ctx =
panvk_priv_mem_host_addr(subq->context);
*cs_ctx = (struct panvk_cs_subqueue_context){
.syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
.debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
.iter_sb = 0,
};
/* We use the geometry buffer for our temporary CS buffer. */
struct cs_buffer root_cs = {
.cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
.gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
.capacity = 64 * 1024 / sizeof(uint64_t),
};
const struct cs_builder_conf conf = {
.nr_registers = 96,
.nr_kernel_registers = 4,
};
struct cs_builder b;
assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
cs_builder_init(&b, &conf, root_cs);
/* Pass the subqueue context through the dedicated context register. */
cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
panvk_priv_mem_dev_addr(subq->context));
/* Initialize scoreboard slots used for asynchronous operations. */
cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));
/* We do a greater-than test on sync objects, and since the reference seqno
* registers are all zero at init time, we need to initialize all syncobjs
* with a seqno of one. */
syncobjs[subqueue].seqno = 1;
if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
cs_ctx->render.tiler_heap =
panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
/* Our geometry buffer comes 4k after the tiler heap, and we encode the
* size in the lower 12 bits so the address can be copied directly
* to the tiler descriptors. */
cs_ctx->render.geom_buf =
(cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
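/* With a 64 KiB geometry buffer, (64 * 1024) >> 12 == 16: the low 12 bits
 * carry the size in 4 KiB units while the upper bits hold the 4 KiB-aligned
 * geometry buffer address. */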
/* Initialize the ringbuf */
cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
.syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
.ptr = queue->render_desc_ringbuf.addr.dev,
.pos = 0,
};
struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);
/* Pre-set the heap context on the vertex-tiler/fragment queues. */
cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
cs_heap_set(&b, heap_ctx_addr);
}
cs_finish(&b);
assert(cs_is_valid(&b));
struct drm_panthor_sync_op syncop = {
.flags =
DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
.handle = queue->syncobj_handle,
.timeline_value = 0,
};
struct drm_panthor_queue_submit qsubmit = {
.queue_index = subqueue,
.stream_size = cs_root_chunk_size(&b),
.stream_addr = cs_root_chunk_gpu_addr(&b),
.latest_flush = 0,
.syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
};
struct drm_panthor_group_submit gsubmit = {
.group_handle = queue->group_handle,
.queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
};
int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
if (ret)
return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
"Failed to initialized subqueue: %m");
ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
NULL);
if (ret)
return vk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
"SyncobjWait failed: %m");
if (debug & PANVK_DEBUG_TRACE) {
uint32_t regs[256] = {0};
pandecode_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
qsubmit.stream_size, phys_dev->kmod.props.gpu_prod_id, regs);
pandecode_next_frame(dev->debug.decode_ctx);
}
return VK_SUCCESS;
}
static void
cleanup_queue(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
panvk_pool_free_mem(&dev->mempools.rw, queue->subqueues[i].context);
finish_render_desc_ringbuf(queue);
panvk_pool_free_mem(&dev->mempools.rw, queue->debug_syncobjs);
panvk_pool_free_mem(&dev->mempools.rw, queue->syncobjs);
}
static VkResult
init_queue(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
VkResult result;
struct panvk_pool_alloc_info alloc_info = {
.size =
ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
.alignment = 64,
};
queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(queue->syncobjs))
return vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to allocate subqueue sync objects");
if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
alloc_info.size =
ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
queue->debug_syncobjs =
panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to allocate subqueue sync objects");
goto err_cleanup_queue;
}
}
result = init_render_desc_ringbuf(queue);
if (result != VK_SUCCESS)
goto err_cleanup_queue;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
result = init_subqueue(queue, i);
if (result != VK_SUCCESS)
goto err_cleanup_queue;
}
return VK_SUCCESS;
err_cleanup_queue:
cleanup_queue(queue);
return result;
}
static VkResult
create_group(struct panvk_queue *queue)
{
const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct drm_panthor_queue_create qc[] = {
[PANVK_SUBQUEUE_VERTEX_TILER] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
[PANVK_SUBQUEUE_FRAGMENT] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
[PANVK_SUBQUEUE_COMPUTE] =
{
.priority = 1,
.ringbuf_size = 64 * 1024,
},
};
struct drm_panthor_group_create gc = {
.compute_core_mask = phys_dev->kmod.props.shader_present,
.fragment_core_mask = phys_dev->kmod.props.shader_present,
.tiler_core_mask = 1,
.max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
.max_fragment_cores =
util_bitcount64(phys_dev->kmod.props.shader_present),
.max_tiler_cores = 1,
.priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
.queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
.vm_id = pan_kmod_vm_handle(dev->kmod.vm),
};
int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
if (ret)
return vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create a scheduling group");
queue->group_handle = gc.group_handle;
return VK_SUCCESS;
}
static void
destroy_group(struct panvk_queue *queue)
{
const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct drm_panthor_group_destroy gd = {
.group_handle = queue->group_handle,
};
ASSERTED int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
assert(!ret);
}
static VkResult
init_tiler(struct panvk_queue *queue)
{
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
VkResult result;
/* We allocate the tiler heap descriptor and geometry buffer in one go,
* so we can pass it through a single 64-bit register to the VERTEX_TILER
* command streams. */
struct panvk_pool_alloc_info alloc_info = {
.size = (64 * 1024) + 4096,
.alignment = 4096,
};
tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
result = vk_errorf(phys_dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a tiler heap context");
goto err_free_desc;
}
tiler_heap->chunk_size = 2 * 1024 * 1024;
struct drm_panthor_tiler_heap_create thc = {
.vm_id = pan_kmod_vm_handle(dev->kmod.vm),
.chunk_size = tiler_heap->chunk_size,
.initial_chunk_count = 5,
.max_chunks = 64,
.target_in_flight = 65535,
};
int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create a tiler heap context");
goto err_free_desc;
}
tiler_heap->context.handle = thc.handle;
tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;
pan_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP, cfg) {
cfg.size = tiler_heap->chunk_size;
cfg.base = thc.first_heap_chunk_gpu_va;
cfg.bottom = cfg.base + 64;
cfg.top = cfg.base + cfg.size;
}
return VK_SUCCESS;
err_free_desc:
panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
return result;
}
static void
cleanup_tiler(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
struct drm_panthor_tiler_heap_destroy thd = {
.handle = tiler_heap->context.handle,
};
ASSERTED int ret =
drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
assert(!ret);
panvk_pool_free_mem(&dev->mempools.rw, tiler_heap->desc);
}
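
/* Submit entry point for the CSF backend. Each command buffer carries one
 * command stream per subqueue; non-empty streams are turned into Panthor
 * queue submits, extra sync-only submits are added to host the wait/signal
 * operations, and the whole batch goes through a single GROUP_SUBMIT
 * ioctl. */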
static VkResult
panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
struct panvk_queue *queue = container_of(vk_queue, struct panvk_queue, vk);
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
VkResult result = VK_SUCCESS;
int ret;
if (vk_queue_is_lost(&queue->vk))
return VK_ERROR_DEVICE_LOST;
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
bool force_sync = debug & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
uint32_t qsubmit_count = 0;
uint32_t used_queue_mask = 0;
for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
struct panvk_cmd_buffer *cmdbuf =
container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);
for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
assert(cs_is_valid(&cmdbuf->state.cs[j].builder));
if (!cs_is_empty(&cmdbuf->state.cs[j].builder)) {
used_queue_mask |= BITFIELD_BIT(j);
qsubmit_count++;
}
}
}
   /* Synchronize all subqueues if no command buffers were submitted. */
if (!qsubmit_count)
used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);
/* We add sync-only queue submits to place our wait/signal operations. */
if (submit->wait_count > 0)
qsubmit_count += util_bitcount(used_queue_mask);
if (submit->signal_count > 0)
qsubmit_count += util_bitcount(used_queue_mask);
uint32_t syncop_count = submit->wait_count + util_bitcount(used_queue_mask);
STACK_ARRAY(struct drm_panthor_queue_submit, qsubmits, qsubmit_count);
STACK_ARRAY(struct drm_panthor_sync_op, syncops, syncop_count);
struct drm_panthor_sync_op *wait_ops = syncops;
struct drm_panthor_sync_op *signal_ops = syncops + submit->wait_count;
qsubmit_count = 0;
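   /* Convert the Vulkan waits into Panthor sync operations and attach them to
    * one sync-only submit per used subqueue, so every subqueue blocks until
    * all waits have signaled. */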
if (submit->wait_count) {
for (uint32_t i = 0; i < submit->wait_count; i++) {
assert(vk_sync_type_is_drm_syncobj(submit->waits[i].sync->type));
struct vk_drm_syncobj *syncobj =
vk_sync_as_drm_syncobj(submit->waits[i].sync);
wait_ops[i] = (struct drm_panthor_sync_op){
.flags = (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE
? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
: DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
DRM_PANTHOR_SYNC_OP_WAIT,
.handle = syncobj->syncobj,
.timeline_value = submit->waits[i].wait_value,
};
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (used_queue_mask & BITFIELD_BIT(i)) {
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = i,
.syncs = DRM_PANTHOR_OBJ_ARRAY(submit->wait_count, wait_ops),
};
}
}
}
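   /* Queue one submit per non-empty command stream, in command buffer order. */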
for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
struct panvk_cmd_buffer *cmdbuf =
container_of(submit->command_buffers[i], struct panvk_cmd_buffer, vk);
for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
if (cs_is_empty(&cmdbuf->state.cs[j].builder))
continue;
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = j,
.stream_size = cs_root_chunk_size(&cmdbuf->state.cs[j].builder),
.stream_addr = cs_root_chunk_gpu_addr(&cmdbuf->state.cs[j].builder),
.latest_flush = 0,
};
}
}
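   /* Add one sync-only submit per used subqueue to signal our internal
    * timeline syncobj. Point N is signaled by the Nth used subqueue, which
    * also lets us wait for full completion in force_sync mode. */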
if (submit->signal_count || force_sync) {
uint32_t signal_op = 0;
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (used_queue_mask & BITFIELD_BIT(i)) {
signal_ops[signal_op] = (struct drm_panthor_sync_op){
.flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
DRM_PANTHOR_SYNC_OP_SIGNAL,
.handle = queue->syncobj_handle,
.timeline_value = signal_op + 1,
};
qsubmits[qsubmit_count++] = (struct drm_panthor_queue_submit){
.queue_index = i,
.syncs = DRM_PANTHOR_OBJ_ARRAY(1, &signal_ops[signal_op++]),
};
}
}
}
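   /* Clear the per-subqueue debug sync objects before submitting so we can
    * check the sequence numbers written by the command streams afterwards. */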
if (force_sync) {
struct panvk_cs_sync32 *debug_syncs =
panvk_priv_mem_host_addr(queue->debug_syncobjs);
assert(debug_syncs);
memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
}
struct drm_panthor_group_submit gsubmit = {
.group_handle = queue->group_handle,
.queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
};
ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
if (ret) {
result = vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
goto out;
}
if (submit->signal_count || force_sync) {
if (force_sync) {
uint64_t point = util_bitcount(used_queue_mask);
ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
&point, 1, INT64_MAX,
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
assert(!ret);
}
for (uint32_t i = 0; i < submit->signal_count; i++) {
assert(vk_sync_type_is_drm_syncobj(submit->signals[i].sync->type));
struct vk_drm_syncobj *syncobj =
vk_sync_as_drm_syncobj(submit->signals[i].sync);
drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
submit->signals[i].signal_value,
queue->syncobj_handle, 0, 0);
}
drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
}
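   /* With PANVK_DEBUG_TRACE, decode every submitted command stream, seeding
    * the register file with the subqueue context pointer the CS expects. */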
if (debug & PANVK_DEBUG_TRACE) {
for (uint32_t i = 0; i < qsubmit_count; i++) {
if (!qsubmits[i].stream_size)
continue;
uint32_t subqueue = qsubmits[i].queue_index;
uint32_t regs[256] = {0};
uint64_t ctx =
panvk_priv_mem_dev_addr(queue->subqueues[subqueue].context);
regs[PANVK_CS_REG_SUBQUEUE_CTX_START] = ctx;
regs[PANVK_CS_REG_SUBQUEUE_CTX_START + 1] = ctx >> 32;
simple_mtx_lock(&dev->debug.decode_ctx->lock);
pandecode_dump_file_open(dev->debug.decode_ctx);
pandecode_log(dev->debug.decode_ctx, "CS%d\n",
qsubmits[i].queue_index);
simple_mtx_unlock(&dev->debug.decode_ctx->lock);
pandecode_cs(dev->debug.decode_ctx, qsubmits[i].stream_addr,
qsubmits[i].stream_size, phys_dev->kmod.props.gpu_prod_id,
regs);
}
}
if (debug & PANVK_DEBUG_DUMP)
pandecode_dump_mappings(dev->debug.decode_ctx);
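   /* In force_sync mode, every submitted stream bumps its subqueue's debug
    * seqno, so a mismatch (or a non-zero error field) means a job didn't
    * complete. */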
if (force_sync) {
struct panvk_cs_sync32 *debug_syncs =
panvk_priv_mem_host_addr(queue->debug_syncobjs);
uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};
for (uint32_t i = 0; i < qsubmit_count; i++) {
if (qsubmits[i].stream_size)
debug_sync_points[qsubmits[i].queue_index]++;
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (debug_syncs[i].seqno != debug_sync_points[i] ||
debug_syncs[i].error != 0)
assert(!"Incomplete job or timeout\n");
}
}
if (debug & PANVK_DEBUG_TRACE)
pandecode_next_frame(dev->debug.decode_ctx);
out:
STACK_ARRAY_FINISH(syncops);
STACK_ARRAY_FINISH(qsubmits);
return result;
}
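
/* Per-arch queue initialization: create the internal timeline syncobj used
 * for submission tracking, the tiler heap, the Panthor scheduling group and
 * the per-subqueue contexts, unwinding in reverse order on failure. */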
VkResult
panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
int idx, const VkDeviceQueueCreateInfo *create_info)
{
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
if (result != VK_SUCCESS)
return result;
int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
if (ret) {
result = vk_errorf(phys_dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to create our internal sync object");
goto err_finish_queue;
}
result = init_tiler(queue);
if (result != VK_SUCCESS)
goto err_destroy_syncobj;
result = create_group(queue);
if (result != VK_SUCCESS)
goto err_cleanup_tiler;
result = init_queue(queue);
if (result != VK_SUCCESS)
goto err_destroy_group;
queue->vk.driver_submit = panvk_queue_submit;
return VK_SUCCESS;
err_destroy_group:
destroy_group(queue);
err_cleanup_tiler:
cleanup_tiler(queue);
err_destroy_syncobj:
drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
err_finish_queue:
vk_queue_finish(&queue->vk);
return result;
}
void
panvk_per_arch(queue_finish)(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
destroy_group(queue);
cleanup_tiler(queue);
drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
vk_queue_finish(&queue->vk);
}

View file

@@ -57,6 +57,15 @@ jm_files = [
   'jm/panvk_vX_queue.c',
 ]
 
+csf_archs = [10]
+csf_inc_dir = ['csf']
+csf_files = [
+  'csf/panvk_vX_cmd_buffer.c',
+  'csf/panvk_vX_cmd_dispatch.c',
+  'csf/panvk_vX_cmd_draw.c',
+  'csf/panvk_vX_queue.c',
+]
+
 common_per_arch_files = [
   panvk_entrypoints[0],
   'panvk_vX_blend.c',
@@ -74,21 +83,7 @@ common_per_arch_files = [
 ]
 
 foreach arch : [6, 7, 10]
-  if arch in valhall_archs
-    # We're lacking key components to compile common source files for v9/v10.
-    # Just add the v10 entrypoints for now.
-    per_arch_files = [
-      panvk_entrypoints[0],
-      'panvk_vX_blend.c',
-      'panvk_vX_descriptor_set.c',
-      'panvk_vX_descriptor_set_layout.c',
-      'panvk_vX_nir_lower_descriptors.c',
-      'panvk_vX_shader.c',
-    ]
-  else
-    per_arch_files = common_per_arch_files
-  endif
+  per_arch_files = common_per_arch_files
 
   inc_panvk_per_arch = []
   if arch in bifrost_archs
@@ -102,6 +97,9 @@ foreach arch : [6, 7, 10]
   if arch in jm_archs
     inc_panvk_per_arch += jm_inc_dir
     per_arch_files += jm_files
+  elif arch in csf_archs
+    inc_panvk_per_arch += csf_inc_dir
+    per_arch_files += csf_files
   endif
 
   panvk_per_arch_libs += static_library(

View file

@@ -36,6 +36,7 @@ static const struct debug_control panvk_debug_options[] = {
    {"linear", PANVK_DEBUG_LINEAR},
    {"dump", PANVK_DEBUG_DUMP},
    {"no_known_warn", PANVK_DEBUG_NO_KNOWN_WARN},
+   {"cs", PANVK_DEBUG_CS},
    {NULL, 0}};
 
 VKAPI_ATTR VkResult VKAPI_CALL

View file

@@ -21,6 +21,7 @@ enum panvk_debug_flags {
    PANVK_DEBUG_LINEAR = 1 << 5,
    PANVK_DEBUG_DUMP = 1 << 6,
    PANVK_DEBUG_NO_KNOWN_WARN = 1 << 7,
+   PANVK_DEBUG_CS = 1 << 8,
 };
 
 #if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \

View file

@@ -29,11 +29,7 @@
 
 #include "genxml/gen_macros.h"
 
-/* FIXME: make the include statement unconditional when the CSF command buffer
- * logic is implemented. */
-#if PAN_ARCH <= 7
 #include "panvk_cmd_buffer.h"
-#endif
 #include "panvk_device.h"
 #include "panvk_instance.h"
 #include "panvk_mempool.h"
@@ -1333,9 +1329,6 @@ static const struct vk_shader_ops panvk_shader_ops = {
       panvk_shader_get_executable_internal_representations,
 };
 
-/* FIXME: make this unconditional when the CSF command buffer logic is
- * implemented. */
-#if PAN_ARCH <= 7
 static void
 panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
                       struct panvk_shader *shader)
@@ -1380,7 +1373,6 @@ panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
       panvk_cmd_bind_shader(cmd, stages[i], shader);
    }
 }
-#endif
 
 const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
    .get_nir_options = panvk_get_nir_options,
@@ -1390,10 +1382,5 @@ const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
    .compile = panvk_compile_shaders,
    .deserialize = panvk_deserialize_shader,
    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
-   /* FIXME: make the assignment unconditional when the CSF command buffer
-    * logic is implemented. */
-#if PAN_ARCH <= 7
    .cmd_bind_shaders = panvk_cmd_bind_shaders,
-#endif
 };