diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index b953ea006ed..fdbdaac2a45 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -63,6 +63,8 @@ foreach ver : panfrost_versions files_panfrost_vx = ['pan_cmdstream.c', pan_packers] if ver in ['4', '5', '6', '7', '9'] files_panfrost_vx += ['pan_jm.c'] + elif ver in ['10'] + files_panfrost_vx += ['pan_csf.c'] endif libpanfrost_versions += static_library( 'panfrost-v' + ver, files_panfrost_vx, diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 33ae3040a8c..02e480f1293 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -46,6 +46,7 @@ #include "pan_bo.h" #include "pan_cmdstream.h" #include "pan_context.h" +#include "pan_csf.h" #include "pan_indirect_dispatch.h" #include "pan_jm.h" #include "pan_job.h" @@ -60,6 +61,8 @@ * functions. */ #if PAN_ARCH <= 9 #define JOBX(__suffix) GENX(jm_##__suffix) +#elif PAN_ARCH <= 10 +#define JOBX(__suffix) GENX(csf_##__suffix) #else #error "Unsupported arch" #endif diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.h b/src/gallium/drivers/panfrost/pan_cmdstream.h index 11382d4c1e2..c1bfd350fe7 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.h +++ b/src/gallium/drivers/panfrost/pan_cmdstream.h @@ -40,7 +40,7 @@ #include "util/u_prim.h" -#define PAN_GPU_INDIRECTS (PAN_ARCH == 7) +#define PAN_GPU_INDIRECTS (PAN_ARCH == 7 || PAN_ARCH >= 10) struct panfrost_rasterizer { struct pipe_rasterizer_state base; diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index caf5e3534b0..b768bc878a7 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -50,6 +50,8 @@ #include "compiler/shader_enums.h" #include "midgard/midgard_compile.h" +#include "pan_csf.h" + #define SET_BIT(lval, bit, cond) \ if (cond) \ lval |= (bit); \ @@ -230,6 +232,10 @@ struct panfrost_context { int in_sync_fd; uint32_t in_sync_obj; + + union { + struct panfrost_csf_context csf; + }; }; /* Corresponds to the CSO */ diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c new file mode 100644 index 00000000000..a37c785542b --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -0,0 +1,1080 @@ +/* + * Copyright (C) 2023 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "decode.h" + +#include "drm-uapi/panthor_drm.h" + +#include "genxml/cs_builder.h" + +#include "pan_blitter.h" +#include "pan_cmdstream.h" +#include "pan_context.h" +#include "pan_csf.h" +#include "pan_job.h" + +#if PAN_ARCH < 10 +#error "CSF helpers are only used for gen >= 10" +#endif + +static struct cs_buffer +csf_alloc_cs_buffer(void *cookie) +{ + assert(cookie && "Self-contained queues can't be extended."); + + struct panfrost_batch *batch = cookie; + unsigned capacity = 4096; + + struct panfrost_ptr ptr = + pan_pool_alloc_aligned(&batch->csf.cs_chunk_pool.base, capacity * 8, 64); + + return (struct cs_buffer){ + .cpu = ptr.cpu, + .gpu = ptr.gpu, + .capacity = capacity, + }; +} + +void +GENX(csf_cleanup_batch)(struct panfrost_batch *batch) +{ + free(batch->csf.cs.builder); + + panfrost_pool_cleanup(&batch->csf.cs_chunk_pool); +} + +void +GENX(csf_init_batch)(struct panfrost_batch *batch) +{ + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + + /* Initialize the CS chunk pool. */ + panfrost_pool_init(&batch->csf.cs_chunk_pool, NULL, dev, 0, 32768, + "CS chunk pool", false, true); + + /* Allocate and bind the command queue */ + struct cs_buffer queue = csf_alloc_cs_buffer(batch); + const struct cs_builder_conf conf = { + .nr_registers = 96, + .nr_kernel_registers = 4, + .alloc_buffer = csf_alloc_cs_buffer, + .cookie = batch, + }; + + /* Setup the queue builder */ + batch->csf.cs.builder = malloc(sizeof(struct cs_builder)); + cs_builder_init(batch->csf.cs.builder, &conf, queue); + cs_req_res(batch->csf.cs.builder, + CS_COMPUTE_RES | CS_TILER_RES | CS_IDVS_RES | CS_FRAG_RES); + + /* Set up entries */ + struct cs_builder *b = batch->csf.cs.builder; + cs_set_scoreboard_entry(b, 2, 0); + + batch->framebuffer = pan_pool_alloc_desc_aggregate( + &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION), + PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); + batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); +} + +static void +csf_prepare_qsubmit(struct panfrost_context *ctx, + struct drm_panthor_queue_submit *submit, uint8_t queue, + uint64_t cs_start, uint32_t cs_size, + struct drm_panthor_sync_op *syncs, uint32_t sync_count) +{ + struct panfrost_device *dev = pan_device(ctx->base.screen); + + *submit = (struct drm_panthor_queue_submit){ + .queue_index = queue, + .stream_addr = cs_start, + .stream_size = cs_size, + .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev), + .syncs = DRM_PANTHOR_OBJ_ARRAY(sync_count, syncs), + }; +} + +static void +csf_prepare_gsubmit(struct panfrost_context *ctx, + struct drm_panthor_group_submit *gsubmit, + struct drm_panthor_queue_submit *qsubmits, + uint32_t qsubmit_count) +{ + *gsubmit = (struct drm_panthor_group_submit){ + .group_handle = ctx->csf.group_handle, + .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits), + }; +} + +static int +csf_submit_gsubmit(struct panfrost_context *ctx, + struct drm_panthor_group_submit *gsubmit) +{ + struct panfrost_device *dev = pan_device(ctx->base.screen); + int ret = 0; + + if (!ctx->is_noop) { + ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_SUBMIT, + gsubmit); + } + + if (ret) + return errno; + + return 0; +} + +static void 
+csf_emit_batch_end(struct panfrost_batch *batch) +{ + struct panfrost_device *dev = pan_device(batch->ctx->base.screen); + struct cs_builder *b = batch->csf.cs.builder; + + /* Barrier to let everything finish */ + cs_wait_slots(b, BITFIELD_MASK(8), false); + + if (dev->debug & PAN_DBG_SYNC) { + /* Get the CS state */ + batch->csf.cs.state = pan_pool_alloc_aligned(&batch->pool.base, 8, 8); + memset(batch->csf.cs.state.cpu, ~0, 8); + cs_move64_to(b, cs_reg64(b, 90), batch->csf.cs.state.gpu); + cs_store_state(b, cs_reg64(b, 90), 0, MALI_CS_STATE_ERROR_STATUS, + cs_now()); + } + + /* Flush caches now that we're done (synchronous) */ + struct cs_index flush_id = cs_reg32(b, 74); + cs_move32_to(b, flush_id, 0); + cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, true, + flush_id, cs_now()); + cs_wait_slot(b, 0, false); + + /* Finish the command stream */ + assert(cs_is_valid(batch->csf.cs.builder)); + cs_finish(batch->csf.cs.builder); +} + +static int +csf_submit_collect_wait_ops(struct panfrost_batch *batch, + struct util_dynarray *syncops, + uint32_t vm_sync_handle) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + uint64_t vm_sync_wait_point = 0, bo_sync_point; + uint32_t bo_sync_handle; + int ret; + + /* We don't wait on BOs attached to the various batch pools, because those + * are private to the batch, and are guaranteed to be idle at allocation + * time. We need to iterate over other BOs accessed by the batch though, + * to add the corresponding wait operations. + */ + util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) { + unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0); + pan_bo_access flags = *ptr; + + if (!flags) + continue; + + /* Update the BO access flags so that panfrost_bo_wait() knows + * about all pending accesses. + * We only keep the READ/WRITE info since this is all the BO + * wait logic cares about. + * We also preserve existing flags as this batch might not + * be the first one to access the BO. + */ + struct panfrost_bo *bo = pan_lookup_bo(dev, i); + + ret = panthor_kmod_bo_get_sync_point(bo->kmod_bo, &bo_sync_handle, + &bo_sync_point, + !(flags & PAN_BO_ACCESS_WRITE)); + if (ret) + return ret; + + if (bo_sync_handle == vm_sync_handle) { + vm_sync_wait_point = MAX2(vm_sync_wait_point, bo_sync_point); + continue; + } + + assert(bo_sync_point == 0 || !bo->kmod_bo->exclusive_vm); + + struct drm_panthor_sync_op waitop = { + .flags = + DRM_PANTHOR_SYNC_OP_WAIT | + (bo_sync_point ? 
DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
+                        : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ),
+         .handle = bo_sync_handle,
+         .timeline_value = bo_sync_point,
+      };
+
+      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
+   }
+
+   if (vm_sync_wait_point > 0) {
+      struct drm_panthor_sync_op waitop = {
+         .flags = DRM_PANTHOR_SYNC_OP_WAIT |
+                  DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
+         .handle = vm_sync_handle,
+         .timeline_value = vm_sync_wait_point,
+      };
+
+      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
+   }
+
+   if (ctx->in_sync_fd >= 0) {
+      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
+                                     ctx->in_sync_fd);
+      if (ret)
+         return ret;
+
+      struct drm_panthor_sync_op waitop = {
+         .flags =
+            DRM_PANTHOR_SYNC_OP_WAIT | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
+         .handle = ctx->in_sync_obj,
+      };
+
+      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
+
+      close(ctx->in_sync_fd);
+      ctx->in_sync_fd = -1;
+   }
+
+   return 0;
+}
+
+static int
+csf_attach_sync_points(struct panfrost_batch *batch, uint32_t vm_sync_handle,
+                       uint64_t vm_sync_signal_point)
+{
+   struct panfrost_context *ctx = batch->ctx;
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   int ret;
+
+   /* There should be no invisible allocation on CSF. */
+   assert(batch->invisible_pool.bos.size == 0);
+
+   /* Attach sync points to batch-private BOs first. We assume BOs can
+    * be written by the GPU to keep things simple.
+    */
+   util_dynarray_foreach(&batch->pool.bos, struct panfrost_bo *, bo) {
+      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
+      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
+                                              vm_sync_signal_point, true);
+      if (ret)
+         return ret;
+   }
+
+   util_dynarray_foreach(&batch->csf.cs_chunk_pool.bos, struct panfrost_bo *,
+                         bo) {
+      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
+      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
+                                              vm_sync_signal_point, true);
+      if (ret)
+         return ret;
+   }
+
+   /* Attach the VM sync point to all resources accessed by the batch. */
+   util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
+      unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
+      pan_bo_access flags = *ptr;
+
+      if (!flags)
+         continue;
+
+      struct panfrost_bo *bo = pan_lookup_bo(dev, i);
+
+      bo->gpu_access |= flags & (PAN_BO_ACCESS_RW);
+      ret = panthor_kmod_bo_attach_sync_point(bo->kmod_bo, vm_sync_handle,
+                                              vm_sync_signal_point,
+                                              flags & PAN_BO_ACCESS_WRITE);
+      if (ret)
+         return ret;
+   }
+
+   /* And finally transfer the VM sync point to the context syncobj. */
+   return drmSyncobjTransfer(panfrost_device_fd(dev), ctx->syncobj, 0,
+                             vm_sync_handle, vm_sync_signal_point, 0);
+}
+
+static void
+csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
+{
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   struct drm_panthor_group_get_state state = {
+      .group_handle = ctx->csf.group_handle,
+   };
+   int ret;
+
+   ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
+                  &state);
+   if (ret) {
+      mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
+      return;
+   }
+
+   /* Context is still usable. This was a transient error. */
+   if (state.state == 0)
+      return;
+
+   /* If the VM is unusable, we can't do much, as this is shared between all
+    * contexts, and restoring the VM state is non-trivial.
+ */ + if (pan_kmod_vm_query_state(dev->kmod.vm) != PAN_KMOD_VM_USABLE) { + mesa_loge("VM became unusable, we can't reset the context"); + assert(!"VM became unusable, we can't reset the context"); + } + + panfrost_context_reinit(ctx); +} + +static void +csf_submit_wait_and_dump(struct panfrost_batch *batch, + const struct drm_panthor_group_submit *gsubmit, + uint32_t vm_sync_handle, uint64_t vm_sync_signal_point) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + bool wait = (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) && !ctx->is_noop; + bool dump = (dev->debug & PAN_DBG_TRACE); + bool crash = false; + + if (!wait && !dump) + return; + + /* Wait so we can get errors reported back */ + if (wait) { + int ret = + drmSyncobjTimelineWait(panfrost_device_fd(dev), &vm_sync_handle, + &vm_sync_signal_point, 1, INT64_MAX, 0, NULL); + assert(ret >= 0); + } + + /* Jobs won't be complete if blackhole rendering, that's ok */ + if (!ctx->is_noop && (dev->debug & PAN_DBG_SYNC) && + *((uint64_t *)batch->csf.cs.state.cpu) != 0) { + crash = true; + dump = true; + } + + if (dump) { + const struct drm_panthor_queue_submit *qsubmits = + (void *)(uintptr_t)gsubmit->queue_submits.array; + + for (unsigned i = 0; i < gsubmit->queue_submits.count; i++) { + uint32_t regs[256] = {0}; + pandecode_cs(dev->decode_ctx, qsubmits[i].stream_addr, + qsubmits[i].stream_size, panfrost_device_gpu_id(dev), + regs); + } + + if (dev->debug & PAN_DBG_DUMP) + pandecode_dump_mappings(dev->decode_ctx); + } + + if (crash) { + fprintf(stderr, "Incomplete job or timeout\n"); + fflush(NULL); + abort(); + } +} + +int +GENX(csf_submit_batch)(struct panfrost_batch *batch) +{ + /* Close the batch before submitting. */ + csf_emit_batch_end(batch); + + uint32_t cs_instr_count = batch->csf.cs.builder->root_chunk.size; + uint64_t cs_start = batch->csf.cs.builder->root_chunk.buffer.gpu; + uint32_t cs_size = cs_instr_count * 8; + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + uint32_t vm_sync_handle = panthor_kmod_vm_sync_handle(dev->kmod.vm); + struct util_dynarray syncops; + int ret; + + util_dynarray_init(&syncops, NULL); + + ret = csf_submit_collect_wait_ops(batch, &syncops, vm_sync_handle); + if (ret) + goto out_free_syncops; + + uint64_t vm_sync_cur_point = panthor_kmod_vm_sync_lock(dev->kmod.vm); + uint64_t vm_sync_signal_point = vm_sync_cur_point + 1; + + struct drm_panthor_sync_op signalop = { + .flags = DRM_PANTHOR_SYNC_OP_SIGNAL | + DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ, + .handle = vm_sync_handle, + .timeline_value = vm_sync_signal_point, + }; + + util_dynarray_append(&syncops, struct drm_panthor_sync_op, signalop); + + struct drm_panthor_queue_submit qsubmit; + struct drm_panthor_group_submit gsubmit; + + csf_prepare_qsubmit( + ctx, &qsubmit, 0, cs_start, cs_size, util_dynarray_begin(&syncops), + util_dynarray_num_elements(&syncops, struct drm_panthor_sync_op)); + csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1); + ret = csf_submit_gsubmit(ctx, &gsubmit); + panthor_kmod_vm_sync_unlock(dev->kmod.vm, + ret ? 
vm_sync_cur_point : vm_sync_signal_point);
+
+   if (!ret) {
+      csf_submit_wait_and_dump(batch, &gsubmit, vm_sync_handle,
+                               vm_sync_signal_point);
+      ret = csf_attach_sync_points(batch, vm_sync_handle, vm_sync_signal_point);
+   } else {
+      csf_check_ctx_state_and_reinit(batch->ctx);
+   }
+
+out_free_syncops:
+   util_dynarray_fini(&syncops);
+   return ret;
+}
+
+void
+GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
+{
+   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
+
+   GENX(pan_preload_fb)
+   (&dev->blitter, &batch->pool.base, NULL, fb, batch->tls.gpu,
+    batch->tiler_ctx.bifrost, NULL);
+}
+
+void
+GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
+                            const struct pan_fb_info *pfb)
+{
+   struct cs_builder *b = batch->csf.cs.builder;
+
+   if (batch->draw_count > 0) {
+      /* Finish tiling and wait for IDVS and tiling */
+      cs_finish_tiling(b, false);
+      cs_wait_slot(b, 2, false);
+      cs_vt_end(b, cs_now());
+   }
+
+   /* Set up the fragment job */
+   cs_move64_to(b, cs_reg64(b, 40), batch->framebuffer.gpu);
+   cs_move32_to(b, cs_reg32(b, 42), (batch->miny << 16) | batch->minx);
+   cs_move32_to(b, cs_reg32(b, 43),
+                ((batch->maxy - 1) << 16) | (batch->maxx - 1));
+
+   /* Run the fragment job and wait */
+   cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+   cs_wait_slot(b, 2, false);
+
+   /* Gather freed heap chunks and add them to the heap context free list
+    * so they can be re-used next time the tiler heap runs out of chunks.
+    * That's what cs_finish_fragment() is all about. The list of freed
+    * chunks is in the tiler context descriptor
+    * (completed_{top,bottom} fields). */
+   if (batch->draw_count > 0) {
+      assert(batch->tiler_ctx.bifrost);
+      cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.bifrost);
+      cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4),
+                 40);
+      cs_wait_slot(b, 0, false);
+      cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now());
+   }
+}
+
+static void
+csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage,
+                     mali_ptr shader)
+{
+   mali_ptr resources = panfrost_emit_resources(batch, stage);
+
+   assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT ||
+          stage == PIPE_SHADER_COMPUTE);
+
+   unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0;
+   unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
+
+   struct cs_builder *b = batch->csf.cs.builder;
+   cs_move64_to(b, cs_reg64(b, 0 + offset), resources);
+   cs_move64_to(b, cs_reg64(b, 8 + offset),
+                batch->push_uniforms[stage] | ((uint64_t)fau_count << 56));
+   cs_move64_to(b, cs_reg64(b, 16 + offset), shader);
+}
+
+void
+GENX(csf_launch_grid)(struct panfrost_batch *batch,
+                      const struct pipe_grid_info *info)
+{
+   /* Empty compute programs are invalid and don't make sense */
+   if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
+      return;
+
+   struct panfrost_context *ctx = batch->ctx;
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
+   struct cs_builder *b = batch->csf.cs.builder;
+
+   csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
+                        batch->rsd[PIPE_SHADER_COMPUTE]);
+
+   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
+
+   /* Global attribute offset */
+   cs_move32_to(b, cs_reg32(b, 32), 0);
+
+   /* Compute workgroup size */
+   uint32_t wg_size[4];
+   pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
+      cfg.workgroup_size_x = info->block[0];
+      cfg.workgroup_size_y = info->block[1];
+      cfg.workgroup_size_z = info->block[2];
+
+      /* Workgroups may be merged if the shader does not use barriers
+       * or shared memory. This condition is checked against the
+       * static shared_size at compile-time. We need to check the
+       * variable shared size at launch_grid time, because the
+       * compiler doesn't know about that.
+       */
+      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
+                                     (info->variable_shared_mem == 0);
+   }
+
+   cs_move32_to(b, cs_reg32(b, 33), wg_size[0]);
+
+   /* Offset */
+   for (unsigned i = 0; i < 3; ++i)
+      cs_move32_to(b, cs_reg32(b, 34 + i), 0);
+
+   unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
+   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
+      &dev->kmod.props, cs->info.work_reg_count);
+
+   if (info->indirect) {
+      /* Load size in workgroups per dimension from memory */
+      struct cs_index address = cs_reg64(b, 64);
+      cs_move64_to(
+         b, address,
+         pan_resource(info->indirect)->image.data.base + info->indirect_offset);
+
+      struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
+      cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);
+
+      /* Wait for the load */
+      cs_wait_slot(b, 0, false);
+
+      /* Copy to FAU */
+      for (unsigned i = 0; i < 3; ++i) {
+         if (batch->num_wg_sysval[i]) {
+            cs_move64_to(b, address, batch->num_wg_sysval[i]);
+            cs_store(b, cs_extract32(b, grid_xyz, i), address, BITFIELD_MASK(1),
+                     0);
+         }
+      }
+
+      /* Wait for the stores */
+      cs_wait_slot(b, 0, false);
+
+      cs_run_compute_indirect(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg),
+                              false, cs_shader_res_sel(0, 0, 0, 0));
+   } else {
+      /* Set size in workgroups per dimension immediately */
+      for (unsigned i = 0; i < 3; ++i)
+         cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);
+
+      /* Pick the task_axis and task_increment to maximize thread utilization. */
+      unsigned task_axis = MALI_TASK_AXIS_X;
+      unsigned threads_per_task = threads_per_wg;
+      unsigned task_increment = 0;
+
+      for (unsigned i = 0; i < 3; i++) {
+         if (threads_per_task * info->grid[i] >= max_thread_cnt) {
+            /* We reached our thread limit, stop at the current axis and
+             * calculate the increment so it doesn't exceed the per-core
+             * thread capacity.
+ */ + task_increment = max_thread_cnt / threads_per_task; + break; + } else if (task_axis == MALI_TASK_AXIS_Z) { + /* We reached the Z axis, and there's still room to stuff more + * threads. Pick the current axis grid size as our increment + * as there's no point using something bigger. + */ + task_increment = info->grid[i]; + break; + } + + threads_per_task *= info->grid[i]; + task_axis++; + } + + assert(task_axis <= MALI_TASK_AXIS_Z); + assert(task_increment > 0); + cs_run_compute(b, task_increment, task_axis, false, + cs_shader_res_sel(0, 0, 0, 0)); + } +} + +void +GENX(csf_launch_xfb)(struct panfrost_batch *batch, + const struct pipe_draw_info *info, unsigned count) +{ + struct cs_builder *b = batch->csf.cs.builder; + + cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu); + + /* TODO: Indexing. Also, attribute_offset is a legacy feature.. */ + cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start); + + /* Compute workgroup size */ + uint32_t wg_size[4]; + pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) { + cfg.workgroup_size_x = 1; + cfg.workgroup_size_y = 1; + cfg.workgroup_size_z = 1; + + /* Transform feedback shaders do not use barriers or + * shared memory, so we may merge workgroups. + */ + cfg.allow_merging_workgroups = true; + } + cs_move32_to(b, cs_reg32(b, 33), wg_size[0]); + + /* Offset */ + for (unsigned i = 0; i < 3; ++i) + cs_move32_to(b, cs_reg32(b, 34 + i), 0); + + cs_move32_to(b, cs_reg32(b, 37), count); + cs_move32_to(b, cs_reg32(b, 38), info->instance_count); + cs_move32_to(b, cs_reg32(b, 39), 1); + + csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX, + batch->rsd[PIPE_SHADER_VERTEX]); + /* XXX: Choose correctly */ + cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0)); +} + +static mali_ptr +csf_get_tiler_desc(struct panfrost_batch *batch) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_device *dev = pan_device(ctx->base.screen); + + if (batch->tiler_ctx.bifrost) + return batch->tiler_ctx.bifrost; + + struct panfrost_ptr t = + pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); + pan_pack(t.cpu, TILER_CONTEXT, tiler) { + unsigned max_levels = dev->tiler_features.max_levels; + assert(max_levels >= 2); + + /* TODO: Select hierarchy mask more effectively */ + tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28; + + /* For large framebuffers, disable the smallest bin size to + * avoid pathological tiler memory usage. Required to avoid OOM + * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on + * Mali-G57. 
+ */ + if (MAX2(batch->key.width, batch->key.height) >= 4096) + tiler.hierarchy_mask &= ~1; + + tiler.fb_width = batch->key.width; + tiler.fb_height = batch->key.height; + tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu; + tiler.sample_pattern = + pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key)); + tiler.first_provoking_vertex = + pan_tristate_get(batch->first_provoking_vertex); + tiler.geometry_buffer = ctx->csf.tmp_geom_bo->ptr.gpu; + tiler.geometry_buffer_size = ctx->csf.tmp_geom_bo->kmod_bo->size; + } + + batch->tiler_ctx.bifrost = t.gpu; + return batch->tiler_ctx.bifrost; +} + +void +GENX(csf_launch_draw)(struct panfrost_batch *batch, + const struct pipe_draw_info *info, unsigned drawid_offset, + const struct pipe_draw_start_count_bias *draw, + unsigned vertex_count) +{ + struct panfrost_context *ctx = batch->ctx; + struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX]; + struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT]; + bool idvs = vs->info.vs.idvs; + bool fs_required = panfrost_fs_required( + fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil); + bool secondary_shader = vs->info.vs.secondary_enable && fs_required; + + assert(idvs && "IDVS required for CSF"); + + struct cs_builder *b = batch->csf.cs.builder; + + if (batch->draw_count == 0) + cs_vt_start(batch->csf.cs.builder, cs_now()); + + csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX, + panfrost_get_position_shader(batch, info)); + + if (fs_required) { + csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT, + batch->rsd[PIPE_SHADER_FRAGMENT]); + } else { + cs_move64_to(b, cs_reg64(b, 4), 0); + cs_move64_to(b, cs_reg64(b, 12), 0); + cs_move64_to(b, cs_reg64(b, 20), 0); + } + + if (secondary_shader) { + cs_move64_to(b, cs_reg64(b, 18), panfrost_get_varying_shader(batch)); + } + + cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu); + cs_move64_to(b, cs_reg64(b, 30), batch->tls.gpu); + cs_move32_to(b, cs_reg32(b, 32), 0); + cs_move32_to(b, cs_reg32(b, 33), draw->count); + cs_move32_to(b, cs_reg32(b, 34), info->instance_count); + cs_move32_to(b, cs_reg32(b, 35), 0); + + /* Base vertex offset on Valhall is used for both indexed and + * non-indexed draws, in a simple way for either. Handle both cases. 
+    */
+   if (info->index_size) {
+      cs_move32_to(b, cs_reg32(b, 36), draw->index_bias);
+      cs_move32_to(b, cs_reg32(b, 39), info->index_size * draw->count);
+   } else {
+      cs_move32_to(b, cs_reg32(b, 36), draw->start);
+      cs_move32_to(b, cs_reg32(b, 39), 0);
+   }
+   cs_move32_to(b, cs_reg32(b, 37), 0);
+   cs_move32_to(b, cs_reg32(b, 38), 0);
+
+   cs_move64_to(b, cs_reg64(b, 40), csf_get_tiler_desc(batch));
+
+   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
+   STATIC_ASSERT(sizeof(uint64_t) == pan_size(SCISSOR));
+   uint64_t *sbd = (uint64_t *)&batch->scissor[0];
+   cs_move64_to(b, cs_reg64(b, 42), *sbd);
+
+   cs_move32_to(b, cs_reg32(b, 44), fui(batch->minimum_z));
+   cs_move32_to(b, cs_reg32(b, 45), fui(batch->maximum_z));
+
+   if (ctx->occlusion_query && ctx->active_queries) {
+      struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
+      cs_move64_to(b, cs_reg64(b, 46), rsrc->image.data.base);
+      panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
+   }
+
+   cs_move32_to(b, cs_reg32(b, 48), panfrost_vertex_attribute_stride(vs, fs));
+   cs_move64_to(b, cs_reg64(b, 50),
+                batch->blend | MAX2(batch->key.nr_cbufs, 1));
+   cs_move64_to(b, cs_reg64(b, 52), batch->depth_stencil);
+
+   if (info->index_size)
+      cs_move64_to(b, cs_reg64(b, 54), batch->indices);
+
+   uint32_t primitive_flags = 0;
+   pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
+      if (panfrost_writes_point_size(ctx))
+         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
+
+      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
+
+      /* Non-fixed restart indices should have been lowered */
+      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
+      cfg.primitive_restart = info->primitive_restart;
+
+      cfg.position_fifo_format = panfrost_writes_point_size(ctx)
+                                    ? MALI_FIFO_FORMAT_EXTENDED
+                                    : MALI_FIFO_FORMAT_BASIC;
+   }
+
+   cs_move32_to(b, cs_reg32(b, 56), primitive_flags);
+
+   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
+
+   uint32_t dcd_flags0 = 0, dcd_flags1 = 0;
+   pan_pack(&dcd_flags0, DCD_FLAGS_0, cfg) {
+      enum mesa_prim reduced_mode = u_reduced_prim(info->mode);
+      bool polygon = reduced_mode == MESA_PRIM_TRIANGLES;
+      bool lines = reduced_mode == MESA_PRIM_LINES;
+
+      /*
+       * From the Gallium documentation,
+       * pipe_rasterizer_state::cull_face "indicates which faces of
+       * polygons to cull". Points and lines are not considered
+       * polygons and should be drawn even if all faces are culled.
+       * The hardware does not take primitive type into account when
+       * culling, so we need to do that check ourselves.
+       */
+      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
+      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
+      cfg.front_face_ccw = rast->front_ccw;
+
+      cfg.multisample_enable = rast->multisample;
+
+      /* Use per-sample shading if required by the API. Also use it when a
+       * blend shader is used with multisampling, as this is handled
+       * by a single ST_TILE in the blend shader with the current
+       * sample ID, requiring per-sample shading.
+ */ + cfg.evaluate_per_sample = + (rast->multisample && + ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader)); + + cfg.single_sampled_lines = !rast->multisample; + + if (lines && rast->line_smooth) { + cfg.multisample_enable = true; + cfg.single_sampled_lines = false; + } + + bool has_oq = ctx->occlusion_query && ctx->active_queries; + if (has_oq) { + if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER) + cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER; + else + cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE; + } + + if (fs_required) { + struct pan_earlyzs_state earlyzs = pan_earlyzs_get( + fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, + ctx->blend->base.alpha_to_coverage, + ctx->depth_stencil->zs_always_passes); + + cfg.pixel_kill_operation = earlyzs.kill; + cfg.zs_update_operation = earlyzs.update; + + cfg.allow_forward_pixel_to_kill = + pan_allow_forward_pixel_to_kill(ctx, fs); + cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global; + + cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); + cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); + + /* Also use per-sample shading if required by the shader + */ + cfg.evaluate_per_sample |= fs->info.fs.sample_shading; + + /* Unlike Bifrost, alpha-to-coverage must be included in + * this identically-named flag. Confusing, isn't it? + */ + cfg.shader_modifies_coverage = fs->info.fs.writes_coverage || + fs->info.fs.can_discard || + ctx->blend->base.alpha_to_coverage; + + cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage; + } else { + /* These operations need to be FORCE to benefit from the + * depth-only pass optimizations. + */ + cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; + cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY; + + /* No shader and no blend => no shader or blend + * reasons to disable FPK. The only FPK-related state + * not covered is alpha-to-coverage which we don't set + * without blend. + */ + cfg.allow_forward_pixel_to_kill = true; + + /* No shader => no shader side effects */ + cfg.allow_forward_pixel_to_be_killed = true; + + /* Alpha isn't written so these are vacuous */ + cfg.overdraw_alpha0 = true; + cfg.overdraw_alpha1 = true; + } + } + + pan_pack(&dcd_flags1, DCD_FLAGS_1, cfg) { + cfg.sample_mask = rast->multisample ? 
ctx->sample_mask : 0xFFFF; + + if (fs_required) { + /* See JM Valhall equivalent code */ + cfg.render_target_mask = + (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask; + } + } + + cs_move32_to(b, cs_reg32(b, 57), dcd_flags0); + cs_move32_to(b, cs_reg32(b, 58), dcd_flags1); + + uint64_t primsize = 0; + panfrost_emit_primitive_size(ctx, info->mode == MESA_PRIM_POINTS, 0, + &primsize); + cs_move64_to(b, cs_reg64(b, 60), primsize); + + uint32_t flags_override; + pan_pack(&flags_override, PRIMITIVE_FLAGS, cfg) { + cfg.draw_mode = pan_draw_mode(info->mode); + cfg.index_type = panfrost_translate_index_size(info->index_size); + cfg.secondary_shader = secondary_shader; + } + + cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0), + cs_shader_res_sel(2, 2, 2, 0), cs_undef()); +} + +#define POSITION_FIFO_SIZE (64 * 1024) + +void +GENX(csf_init_context)(struct panfrost_context *ctx) +{ + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct drm_panthor_queue_create qc[] = {{ + .priority = 1, + .ringbuf_size = 64 * 1024, + }}; + + struct drm_panthor_group_create gc = { + .compute_core_mask = dev->kmod.props.shader_present, + .fragment_core_mask = dev->kmod.props.shader_present, + .tiler_core_mask = 1, + .max_compute_cores = util_bitcount64(dev->kmod.props.shader_present), + .max_fragment_cores = util_bitcount64(dev->kmod.props.shader_present), + .max_tiler_cores = 1, + .priority = PANTHOR_GROUP_PRIORITY_MEDIUM, + .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc), + .vm_id = pan_kmod_vm_handle(dev->kmod.vm), + }; + + int ret = + drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc); + + assert(!ret); + + ctx->csf.group_handle = gc.group_handle; + + /* Get tiler heap */ + struct drm_panthor_tiler_heap_create thc = { + .vm_id = pan_kmod_vm_handle(dev->kmod.vm), + /* 2M chunks. */ + .chunk_size = 2 * 1024 * 1024, + .initial_chunk_count = 5, + + /* 64 x 2M = 128M, which matches the tiler_heap BO allocated in + * panfrost_open_device() for pre-v10 HW. 
+ */ + .max_chunks = 64, + .target_in_flight = 65535, + }; + ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, + &thc); + + assert(!ret); + + ctx->csf.heap.handle = thc.handle; + + ctx->csf.heap.desc_bo = + panfrost_bo_create(dev, pan_size(TILER_HEAP), 0, "Tiler Heap"); + pan_pack(ctx->csf.heap.desc_bo->ptr.cpu, TILER_HEAP, heap) { + heap.size = 2 * 1024 * 1024; + heap.base = thc.first_heap_chunk_gpu_va; + heap.bottom = heap.base + 64; + heap.top = heap.base + heap.size; + } + + ctx->csf.tmp_geom_bo = panfrost_bo_create( + dev, POSITION_FIFO_SIZE, PAN_BO_INVISIBLE, "Temporary Geometry buffer"); + assert(ctx->csf.tmp_geom_bo); + + /* Setup the tiler heap */ + struct panfrost_bo *cs_bo = + panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer"); + assert(cs_bo); + + struct cs_buffer init_buffer = { + .cpu = cs_bo->ptr.cpu, + .gpu = cs_bo->ptr.gpu, + .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t), + }; + const struct cs_builder_conf bconf = { + .nr_registers = 96, + .nr_kernel_registers = 4, + }; + struct cs_builder b; + cs_builder_init(&b, &bconf, init_buffer); + struct cs_index heap = cs_reg64(&b, 72); + cs_move64_to(&b, heap, thc.tiler_heap_ctx_gpu_va); + cs_heap_set(&b, heap); + + struct drm_panthor_queue_submit qsubmit; + struct drm_panthor_group_submit gsubmit; + struct drm_panthor_sync_op sync = { + .flags = + DRM_PANTHOR_SYNC_OP_SIGNAL | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ, + .handle = ctx->syncobj, + }; + + assert(cs_is_valid(&b)); + cs_finish(&b); + + uint32_t cs_instr_count = b.root_chunk.size; + uint64_t cs_start = b.root_chunk.buffer.gpu; + uint32_t cs_size = cs_instr_count * 8; + + csf_prepare_qsubmit(ctx, &qsubmit, 0, cs_start, cs_size, &sync, 1); + csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1); + ret = csf_submit_gsubmit(ctx, &gsubmit); + assert(!ret); + + /* Wait before freeing the buffer. */ + drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0, + NULL); + panfrost_bo_unreference(cs_bo); +} + +void +GENX(csf_cleanup_context)(struct panfrost_context *ctx) +{ + struct panfrost_device *dev = pan_device(ctx->base.screen); + struct drm_panthor_tiler_heap_destroy thd = { + .handle = ctx->csf.heap.handle, + }; + int ret; + + /* Make sure all jobs are done before destroying the heap. */ + ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0, + NULL); + assert(!ret); + + ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, + &thd); + assert(!ret); + + struct drm_panthor_group_destroy gd = { + .group_handle = ctx->csf.group_handle, + }; + + ret = + drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd); + assert(!ret); + + panfrost_bo_unreference(ctx->csf.heap.desc_bo); +} diff --git a/src/gallium/drivers/panfrost/pan_csf.h b/src/gallium/drivers/panfrost/pan_csf.h new file mode 100644 index 00000000000..ee03219ddc7 --- /dev/null +++ b/src/gallium/drivers/panfrost/pan_csf.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2023 Collabora Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PAN_CSF_H__ +#define __PAN_CSF_H__ + +#include "compiler/shader_enums.h" + +#include "pan_bo.h" +#include "pan_mempool.h" + +struct cs_builder; + +struct panfrost_csf_batch { + /* CS related fields. */ + struct { + /* CS builder. */ + struct cs_builder *builder; + + /* CS state, written through the CS, and checked when PAN_MESA_DEBUG=sync. + */ + struct panfrost_ptr state; + } cs; + + /* Pool used to allocate CS chunks. */ + struct panfrost_pool cs_chunk_pool; +}; + +struct panfrost_csf_context { + uint32_t group_handle; + + struct { + uint32_t handle; + struct panfrost_bo *desc_bo; + } heap; + + /* Temporary geometry buffer. Used as a FIFO by the tiler. 
*/ + struct panfrost_bo *tmp_geom_bo; +}; + +#if defined(PAN_ARCH) && PAN_ARCH >= 10 + +#include "genxml/gen_macros.h" + +struct panfrost_batch; +struct panfrost_context; +struct pan_fb_info; +struct pipe_draw_info; +struct pipe_grid_info; +struct pipe_draw_start_count_bias; + +void GENX(csf_init_context)(struct panfrost_context *ctx); +void GENX(csf_cleanup_context)(struct panfrost_context *ctx); + +void GENX(csf_init_batch)(struct panfrost_batch *batch); +void GENX(csf_cleanup_batch)(struct panfrost_batch *batch); +int GENX(csf_submit_batch)(struct panfrost_batch *batch); + +void GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb); +void GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, + const struct pan_fb_info *pfb); +void GENX(csf_emit_batch_end)(struct panfrost_batch *batch); +void GENX(csf_launch_xfb)(struct panfrost_batch *batch, + const struct pipe_draw_info *info, unsigned count); +void GENX(csf_launch_grid)(struct panfrost_batch *batch, + const struct pipe_grid_info *info); +void GENX(csf_launch_draw)(struct panfrost_batch *batch, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_start_count_bias *draw, + unsigned vertex_count); + +#endif /* PAN_ARCH >= 10 */ + +#endif /* __PAN_CSF_H__ */ diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index 04d3745493b..aebc4ae9074 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -28,6 +28,7 @@ #include "pipe/p_state.h" #include "util/u_dynarray.h" +#include "pan_csf.h" #include "pan_desc.h" #include "pan_jm.h" #include "pan_mempool.h" @@ -218,6 +219,7 @@ struct panfrost_batch { /* Job frontend specific fields. */ union { struct panfrost_jm_batch jm; + struct panfrost_csf_batch csf; }; };
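
Editor's note, illustrative only and not part of the patch: the JOBX() change in pan_cmdstream.c is a compile-time dispatch. Each per-version static library built by meson compiles pan_cmdstream.c alongside either pan_jm.c (v4-v9) or pan_csf.c (v10), and JOBX(name) token-pastes to the matching GENX(jm_name) or GENX(csf_name) entry point. A minimal, self-contained sketch of that pattern (the function names below are hypothetical stand-ins, not the driver's symbols):

#include <stdio.h>

#define PAN_ARCH 10

/* Stand-ins for the GENX(jm_*) / GENX(csf_*) entry points. */
void jm_submit_batch(void)  { printf("JM path (arch <= 9)\n"); }
void csf_submit_batch(void) { printf("CSF path (arch == 10)\n"); }

#if PAN_ARCH <= 9
#define JOBX(__suffix) jm_##__suffix
#elif PAN_ARCH <= 10
#define JOBX(__suffix) csf_##__suffix
#else
#error "Unsupported arch"
#endif

int main(void)
{
   /* With PAN_ARCH == 10 this expands to csf_submit_batch(). */
   JOBX(submit_batch)();
   return 0;
}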
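
Also illustrative only: GENX(csf_launch_grid) picks the task axis and increment by accumulating threads across grid dimensions until the per-core thread limit is reached, then sizes the increment from the remaining headroom. The same selection loop, pulled out into a standalone function with a made-up example dispatch (the MALI_TASK_AXIS_* values are replaced by a local enum here):

#include <assert.h>
#include <stdio.h>

enum task_axis { AXIS_X, AXIS_Y, AXIS_Z };

/* Mirrors the selection loop in csf_launch_grid(): stop at the first axis
 * where threads_per_task * grid[i] would meet or exceed max_thread_cnt, and
 * size the increment so one task stays within the per-core thread capacity. */
static void
pick_task_split(const unsigned grid[3], unsigned threads_per_wg,
                unsigned max_thread_cnt, enum task_axis *axis_out,
                unsigned *increment_out)
{
   enum task_axis task_axis = AXIS_X;
   unsigned threads_per_task = threads_per_wg;
   unsigned task_increment = 0;

   for (unsigned i = 0; i < 3; i++) {
      if (threads_per_task * grid[i] >= max_thread_cnt) {
         task_increment = max_thread_cnt / threads_per_task;
         break;
      } else if (task_axis == AXIS_Z) {
         /* Still room on the last axis: its grid size is enough. */
         task_increment = grid[i];
         break;
      }

      threads_per_task *= grid[i];
      task_axis = (enum task_axis)(task_axis + 1);
   }

   assert(task_increment > 0);
   *axis_out = task_axis;
   *increment_out = task_increment;
}

int main(void)
{
   const unsigned grid[3] = {64, 64, 1}; /* example dispatch */
   enum task_axis axis;
   unsigned increment;

   /* 8x8x1 workgroups, 1024 threads per core (example numbers). */
   pick_task_split(grid, 8 * 8 * 1, 1024, &axis, &increment);
   printf("axis=%d increment=%u\n", axis, increment);
   return 0;
}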