Merge branch 'panfrost-v15' into 'main'

Draft: panfrost: Add v15 support

See merge request mesa/mesa!41366
Lars-Ivar Hesselberg Simonsen 2026-05-08 02:09:53 +02:00
commit 7731477125
92 changed files with 10165 additions and 769 deletions

View file

@ -34,6 +34,10 @@ The following hardware is currently supported:
+--------------------+---------------+-----------+--------+--------+
| G725 | 5th Gen (v13) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
| G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
| TMAx | 5th Gen (v15) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
Other Midgard and Bifrost chips (e.g. G71) are not yet supported.

View file

@ -350,7 +350,7 @@ struct drm_panthor_gpu_info {
__u32 as_present;
/**
* @select_coherency: Coherency selected for this device.
* @selected_coherency: Coherency selected for this device.
*
* One of drm_panthor_gpu_coherency.
*/
@ -368,11 +368,27 @@ struct drm_panthor_gpu_info {
/** @core_features: Used to discriminate core variants when they exist. */
__u32 core_features;
/** @pad: MBZ. */
__u32 pad;
/** @thread_num_active_granularity: Granularity of the number of active threads. */
__u32 thread_num_active_granularity;
/** @gpu_features: Bitmask describing supported GPU-wide features. */
__u64 gpu_features;
/** @gpu_wide_id: 64-bit GPU_ID for v15 onwards. */
__u64 gpu_wide_id;
#define DRM_PANTHOR_WIDE_ARCH_MAJOR(x) (((x) >> 56) & 0xff)
#define DRM_PANTHOR_WIDE_ARCH_MINOR(x) (((x) >> 48) & 0xff)
#define DRM_PANTHOR_WIDE_ARCH_REV(x) (((x) >> 40) & 0xff)
#define DRM_PANTHOR_WIDE_PRODUCT_MAJOR(x) (((x) >> 32) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_MAJOR(x) (((x) >> 16) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_MINOR(x) (((x) >> 8) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_STATUS(x) ((x) & 0xff)
/** @gpu_rev_wide: 64-bit GPU revision for v15 onwards. */
__u64 gpu_rev_wide;
/** @l2_features_wide: 64-bit L2_FEATURES for v15 onwards. */
__u64 l2_features_wide;
};
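
For illustration (not part of the diff): a minimal sketch of how userspace might pick apart the new 64-bit GPU_ID with the macros above, assuming panthor_drm.h is included; the helper name and output format are hypothetical.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper: decode drm_panthor_gpu_info::gpu_wide_id using
     * the uAPI macros added above. */
    static void print_wide_gpu_id(uint64_t id)
    {
       printf("arch v%u.%u r%u, product 0x%02x, version %u.%u (status %u)\n",
              (unsigned)DRM_PANTHOR_WIDE_ARCH_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_ARCH_MINOR(id),
              (unsigned)DRM_PANTHOR_WIDE_ARCH_REV(id),
              (unsigned)DRM_PANTHOR_WIDE_PRODUCT_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_MINOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_STATUS(id));
    }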
/**
@ -409,6 +425,38 @@ struct drm_panthor_csif_info {
__u32 pad;
};
/**
* enum drm_panthor_timestamp_info_flags - drm_panthor_timestamp_info.flags
*/
enum drm_panthor_timestamp_info_flags {
/** @DRM_PANTHOR_TIMESTAMP_GPU: Query GPU time. */
DRM_PANTHOR_TIMESTAMP_GPU = 1 << 0,
/** @DRM_PANTHOR_TIMESTAMP_CPU_NONE: Don't query CPU time. */
DRM_PANTHOR_TIMESTAMP_CPU_NONE = 0 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC: Query CPU time using CLOCK_MONOTONIC. */
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC = 1 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW: Query CPU time using CLOCK_MONOTONIC_RAW. */
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW = 2 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK: Space reserved for CPU clock type. */
DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK = 7 << 1,
/** @DRM_PANTHOR_TIMESTAMP_GPU_OFFSET: Query GPU offset. */
DRM_PANTHOR_TIMESTAMP_GPU_OFFSET = 1 << 4,
/** @DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT: Query GPU cycle count. */
DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT = 1 << 5,
/** @DRM_PANTHOR_TIMESTAMP_FREQ: Query timestamp frequency. */
DRM_PANTHOR_TIMESTAMP_FREQ = 1 << 6,
/** @DRM_PANTHOR_TIMESTAMP_DURATION: Return duration of time query. */
DRM_PANTHOR_TIMESTAMP_DURATION = 1 << 7,
};
/**
* struct drm_panthor_timestamp_info - Timestamp information
*
@ -421,11 +469,38 @@ struct drm_panthor_timestamp_info {
*/
__u64 timestamp_frequency;
/** @current_timestamp: The current timestamp. */
/** @current_timestamp: The current GPU timestamp. */
__u64 current_timestamp;
/** @timestamp_offset: The offset of the timestamp timer. */
/** @timestamp_offset: The offset of the GPU timestamp timer. */
__u64 timestamp_offset;
/**
* @flags: Bitmask of drm_panthor_timestamp_info_flags.
*
* If set to 0, then it is interpreted as:
* DRM_PANTHOR_TIMESTAMP_GPU |
* DRM_PANTHOR_TIMESTAMP_GPU_OFFSET |
* DRM_PANTHOR_TIMESTAMP_FREQ
*
* Note: the following flags are mutually exclusive (at most one can be set):
* - DRM_PANTHOR_TIMESTAMP_CPU_NONE
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW
*/
__u32 flags;
/** @duration_nsec: Duration of time query. */
__u32 duration_nsec;
/** @cycle_count: Value of GPU_CYCLE_COUNT. */
__u64 cycle_count;
/** @cpu_timestamp_sec: Seconds part of CPU timestamp. */
__u64 cpu_timestamp_sec;
/** @cpu_timestamp_nsec: Nanoseconds part of CPU timestamp. */
__u64 cpu_timestamp_nsec;
};
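
A quick sketch (not from the diff) of composing a valid flags value: the CPU clock type occupies the 3-bit DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK field, so only one of the _CPU_* values can be encoded at a time.

    #include <assert.h>

    static __u32 make_ts_flags(void)
    {
       /* Request GPU and CLOCK_MONOTONIC CPU timestamps plus the query
        * duration. */
       __u32 flags = DRM_PANTHOR_TIMESTAMP_GPU |
                     DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC |
                     DRM_PANTHOR_TIMESTAMP_DURATION;

       /* The CPU clock type is a 3-bit field: at most one _CPU_* value. */
       assert((flags & DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK) ==
              DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC);
       return flags;
    }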
/**

View file

@ -41,7 +41,7 @@ compile_args_panfrost = [
'-Wno-pointer-arith'
]
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13']
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_versions = []
foreach ver : panfrost_versions
@ -54,7 +54,7 @@ foreach ver : panfrost_versions
]
if ver in ['4', '5', '6', '7', '9']
files_panfrost_vx += ['pan_jm.c']
elif ver in ['10', '12', '13']
elif ver in ['10', '12', '13', '14', '15']
files_panfrost_vx += ['pan_csf.c']
endif
libpanfrost_versions += static_library(

View file

@ -49,7 +49,7 @@
* functions. */
#if PAN_ARCH <= 9
#define JOBX(__suffix) GENX(jm_##__suffix)
#elif PAN_ARCH <= 13
#elif PAN_ARCH <= 15
#define JOBX(__suffix) GENX(csf_##__suffix)
#else
#error "Unsupported arch"
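
As a standalone illustration of the dispatch above (GENX's real per-generation suffixing is approximated here), with PAN_ARCH == 15 the second branch is taken, so v10..v15 all share the CSF job path while v4..v9 keep the legacy job-manager path:

    #include <stdio.h>

    #define PAN_ARCH 15
    #define GENX(name) name##_v15   /* approximation of the real GENX */
    #if PAN_ARCH <= 9
    #define JOBX(__suffix) GENX(jm_##__suffix)
    #elif PAN_ARCH <= 15
    #define JOBX(__suffix) GENX(csf_##__suffix)
    #else
    #error "Unsupported arch"
    #endif

    static void csf_emit_fbds_v15(void) { puts("CSF job path"); }

    int main(void)
    {
       JOBX(emit_fbds)();   /* expands to csf_emit_fbds_v15() */
       return 0;
    }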
@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
.tls.size = ss->info.tls_size,
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
.wls.instances = pan_calc_wls_instances(
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
ss->info.work_reg_count),
};
if (ss->info.tls_size) {
@ -4455,11 +4456,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
else if (vs)
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
cfg.preload.r0_r15 = state->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
cfg.binary = state->bin.gpu;
cfg.preload.r48_r63 = (state->info.preload >> 48);
#endif
cfg.binary = state->bin.gpu;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
@ -4475,10 +4480,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
#if PAN_ARCH < 12
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
cfg.preload.r0_r15 = state->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (state->info.preload >> 48);
#endif
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
}

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2023 Collabora Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -13,6 +14,7 @@
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_csf.h"
#include "pan_fb.h"
#include "pan_fb_preload.h"
#include "pan_job.h"
#include "pan_trace.h"
@ -75,6 +77,99 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
(PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) + \
offsetof(struct pan_ptr, gpu))
#if PAN_ARCH >= 14
static void
cs_emit_static_fragment_state(struct cs_builder *b,
struct panfrost_batch *batch,
const struct pan_fb_info *fb)
{
struct mali_fragment_bounding_box_packed bbox;
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
cfg.bound_min_x = batch->minx;
cfg.bound_min_y = batch->miny;
cfg.bound_max_x = batch->maxx - 1;
cfg.bound_max_y = batch->maxy - 1;
}
struct mali_frame_size_packed frame_size;
pan_pack(&frame_size, FRAME_SIZE, cfg) {
cfg.width = fb->width;
cfg.height = fb->height;
}
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
bbox.opaque[0] | ((uint64_t)bbox.opaque[1] << 32));
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
fb->sample_positions);
struct mali_fragment_flags_1_packed flags1;
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
/* The force_samples setting dictates the sample count that is used
* for rasterization, and works like D3D11's ForcedSampleCount
* feature:
*
* - If force_samples == 0: let nr_samples dictate the sample count
* - If force_samples == 1: force single-sampled rasterization
* - If force_samples > 1: force multi-sampled rasterization
*
* This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the
* fragment shader, even when performing single-sampled rendering.
*/
if (fb->pls_enabled) {
cfg.sample_count = 4;
cfg.sample_pattern = pan_sample_pattern(1);
} else if (!fb->force_samples) {
cfg.sample_count = fb->nr_samples;
cfg.sample_pattern = pan_sample_pattern(fb->nr_samples);
} else if (fb->force_samples == 1) {
cfg.sample_count = fb->nr_samples;
cfg.sample_pattern = pan_sample_pattern(1);
} else {
cfg.sample_count = 1;
cfg.sample_pattern = pan_sample_pattern(fb->force_samples);
}
cfg.effective_tile_size = fb->tile_size;
cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin;
cfg.first_provoking_vertex = fb->first_provoking_vertex;
cfg.render_target_count = MAX2(fb->rt_count, 1);
cfg.color_buffer_allocation = fb->cbuf_allocation;
}
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
}
#define PAN_CS_REG_FBD_LAYER_PTR 54
static inline void
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
{
/* Emit the dynamic fragment state. This state may change per-layer. */
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
offsetof(struct pan_fbd_layer, flags0));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
offsetof(struct pan_fbd_layer, flags2));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
offsetof(struct pan_fbd_layer, z_clear));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, tiler));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, rtd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dbd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
offsetof(struct pan_fbd_layer, frame_argument));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dcd_pointer));
cs_flush_loads(b);
}
#endif /* PAN_ARCH >= 14 */
static int
csf_oom_handler_init(struct panfrost_context *ctx)
{
@ -113,13 +208,18 @@ csf_oom_handler_init(struct panfrost_context *ctx)
cs_function_def(&b, &handler, handler_ctx) {
struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
struct cs_index counter = cs_reg32(&b, 47);
struct cs_index zero = cs_reg64(&b, 48);
struct cs_index flush_id = cs_reg32(&b, 48);
struct cs_index tiler_ctx = cs_reg64(&b, 50);
struct cs_index completed_top = cs_reg64(&b, 52);
struct cs_index completed_bottom = cs_reg64(&b, 54);
struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);
struct cs_index counter = cs_reg32(&b, 31);
struct cs_index zero = cs_reg64(&b, 56);
struct cs_index flush_id = cs_reg32(&b, 58);
struct cs_index tiler_ctx = cs_reg64(&b, 60);
struct cs_index completed_top = cs_reg64(&b, 64);
struct cs_index completed_bottom = cs_reg64(&b, 66);
struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4);
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(&b, PAN_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER);
#endif
/* Ensure that the OTHER endpoint is valid */
#if PAN_ARCH >= 11
@ -133,25 +233,31 @@ csf_oom_handler_init(struct panfrost_context *ctx)
cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
cs_wait_slot(&b, 0);
cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
FBD_OFFSET(MIDDLE));
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE));
}
cs_else(&b) {
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
FBD_OFFSET(FIRST));
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST));
}
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(&b, fbd_pointer);
#else
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx,
FIELD_OFFSET(bbox_min));
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MAX), tiler_oom_ctx,
FIELD_OFFSET(bbox_max));
cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0);
cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0);
#endif
cs_wait_slot(&b, 0);
/* Run the fragment job and wait */
cs_select_endpoint_sb(&b, 3);
#if PAN_ARCH >= 14
cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
cs_wait_slot(&b, 3);
/* Increment counter */
@ -218,6 +324,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
}
#if PAN_ARCH >= 14
static inline struct pan_ptr
alloc_fbd(struct panfrost_batch *batch)
{
const struct pan_desc_alloc_info fbd_layer = {
.size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64),
.align = alignof(struct pan_fbd_layer),
.nelems = 1,
};
return pan_pool_alloc_desc_aggregate(
&batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION),
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
}
#else
static inline struct pan_ptr
alloc_fbd(struct panfrost_batch *batch)
{
@ -225,6 +346,7 @@ alloc_fbd(struct panfrost_batch *batch)
&batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
}
#endif /* PAN_ARCH >= 14 */
int
GENX(csf_init_batch)(struct panfrost_batch *batch)
@ -758,7 +880,7 @@ GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
(_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS]
#define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \
GET_FBD(_ctx, _pass).gpu |= \
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu)
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass))
void
GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
@ -771,7 +893,7 @@ GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
/* Default framebuffer descriptor */
batch->framebuffer.gpu |=
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
if (batch->draw_count == 0)
return;
@ -854,15 +976,26 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
cs_vt_end(b, cs_now());
}
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PAN_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
/* Set up the fragment job */
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
batch->framebuffer.gpu);
cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu);
#if PAN_ARCH >= 14
cs_emit_static_fragment_state(b, batch, pfb);
cs_emit_layer_fragment_state(b, fbd_pointer);
#else
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
(batch->miny << 16) | batch->minx);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX),
((batch->maxy - 1) << 16) | (batch->maxx - 1));
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0);
#endif
/* Use different framebuffer descriptor if incremental rendering was
* triggered while tiling */
@ -871,13 +1004,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
cs_wait_slot(b, 0);
cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
GET_FBD(oom_ctx, LAST).gpu);
cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu);
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(b, fbd_pointer);
#endif
}
}
/* Run the fragment job and wait */
#if PAN_ARCH >= 14
cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
cs_wait_slot(b, 2);
/* Gather freed heap chunks and add them to the heap context free list

View file

@ -1105,9 +1105,14 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
cfg.register_count = preload_shader->info.work_reg_count;
cfg.preload.r0_r15 = preload_shader->info.preload;
#else
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
cfg.binary = preload_shader->address;
cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
#endif
cfg.binary = preload_shader->address;
}
unsigned bd_count = views.rt_count;

View file

@ -257,8 +257,8 @@ GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
{
PAN_TRACE_FUNC(PAN_TRACE_GL_JM);
batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
batch->framebuffer.gpu |=
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
}
void

View file

@ -98,10 +98,15 @@ panfrost_precomp_shader_create(
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&res->info);
#if PAN_ARCH >= 15
cfg.register_count = res->info.work_reg_count;
cfg.preload.r0_r15 = res->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(res->info.work_reg_count);
cfg.binary = res->code_ptr;
cfg.preload.r48_r63 = (res->info.preload >> 48);
#endif
cfg.binary = res->code_ptr;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info);
}
@ -197,8 +202,9 @@ emit_tls(struct panfrost_batch *batch,
struct pan_tls_info info = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
.wls.instances = pan_calc_wls_instances(&shader->local_size,
&dev->kmod.dev->props, dim),
.wls.instances =
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
shader->info.work_reg_count),
};
if (info.tls.size) {
@ -325,7 +331,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_0), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = shader->state_ptr;
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr);
#else
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr);
#endif
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd);
/* Global attribute offset */

View file

@ -1175,6 +1175,12 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config,
case 13:
panfrost_cmdstream_screen_init_v13(screen);
break;
case 14:
panfrost_cmdstream_screen_init_v14(screen);
break;
case 15:
panfrost_cmdstream_screen_init_v15(screen);
break;
default:
debug_printf("panfrost: Unhandled architecture major %d", dev->arch);
panfrost_destroy_screen(&(screen->base));

View file

@ -155,6 +155,8 @@ void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v15(struct panfrost_screen *screen);
#define perf_debug(ctx, ...) \
do { \

View file

@ -10,6 +10,7 @@
#include "panfrost/compiler/bifrost/bifrost_compile.h"
#include "panfrost/compiler/pan_compiler.h"
#include "panfrost/compiler/pan_nir.h"
#include "panfrost/model/pan_model.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
@ -275,7 +276,7 @@ main(int argc, const char **argv)
unsigned target_arch = atoi(target_arch_str);
if (target_arch < 4 || target_arch > 13) {
if (target_arch < 4 || target_arch > 15) {
fprintf(stderr, "Unsupported target arch %d\n", target_arch);
return 1;
}
@ -353,7 +354,12 @@ main(int argc, const char **argv)
libfunc, MESA_SHADER_COMPUTE, v, get_compiler_options(target_arch),
&opt, load_kernel_input);
uint64_t target_gpu_id = (target_arch & 0xf) << 28;
uint64_t target_gpu_id;
if (target_arch >= PAN_ID64_COMPAT)
target_gpu_id =
((uint64_t)(target_arch & 0xff) << 56) | (PAN_ID64_COMPAT << 28);
else
target_gpu_id = (target_arch & 0xf) << 28;
struct pan_compile_inputs inputs = {
.gpu_id = target_gpu_id,

View file

@ -16,14 +16,14 @@
*/
static uint32_t
va_op_swizzles(enum bi_opcode op, unsigned src)
va_op_swizzles(enum bi_opcode op, unsigned src, unsigned arch)
{
/* This is a bifrost-only instruction that is lowered on valhall */
if (!valhall_opcodes[op].exact)
if (!get_valhall_opcode(op, arch).exact)
return bi_op_swizzles[op][src];
uint32_t swizzles = 0;
struct va_src_info info = va_src_info(op, src);
struct va_src_info info = va_src_info(op, src, arch);
if (info.swizzle) {
assert(info.size == VA_SIZE_16 || info.size == VA_SIZE_32);
@ -99,8 +99,8 @@ bool
bi_op_supports_swizzle(enum bi_opcode op, unsigned src,
enum bi_swizzle swizzle, unsigned arch)
{
uint32_t supported_swizzles = arch >= 9 ?
va_op_swizzles(op, src) : bi_op_swizzles[op][src];
uint32_t supported_swizzles =
arch >= 9 ? va_op_swizzles(op, src, arch) : bi_op_swizzles[op][src];
return supported_swizzles & BITFIELD_BIT(swizzle);
}

View file

@ -294,7 +294,8 @@ bi_compute_liveness_ra(bi_context *ctx)
#define EVEN_BITS_MASK (0x5555555555555555ull)
static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file,
unsigned arch)
{
uint64_t clobbered = 0;
@ -308,12 +309,12 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
clobbered |= mask << (64 - excess);
if (split_file)
clobbered |= mask << (16 - excess);
clobbered |= mask << (((arch >= 15) ? 32 : 16) - excess);
}
/* Don't allocate the middle if we split out the middle */
if (split_file)
clobbered |= BITFIELD64_MASK(32) << 16;
clobbered |= BITFIELD64_MASK(32) << ((arch >= 15) ? 32 : 16);
/* We can use a register iff it's not clobbered */
return ~clobbered;
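
A self-contained check (not from the diff) of what these masks work out to: with split_file set, the usable registers are r0..r15 plus r48..r63 before v15, and r0..r31 on v15 onwards, which matches the default_affinity change further down.

    #include <assert.h>
    #include <stdint.h>

    #define BITFIELD64_MASK(n) (((n) >= 64) ? ~0ull : (1ull << (n)) - 1)

    int main(void)
    {
       /* Pre-v15: clobber the middle 32 registers starting at r16. */
       uint64_t pre_v15 = ~(BITFIELD64_MASK(32) << 16);
       /* v15+: clobber the middle 32 registers starting at r32. */
       uint64_t v15 = ~(BITFIELD64_MASK(32) << 32);

       assert(pre_v15 == (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)));
       assert(v15 == BITFIELD64_MASK(32));
       return 0;
    }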
@ -341,7 +342,7 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
unsigned count = bi_count_write_registers(ins, d);
unsigned offset = ins->dest[d].offset;
uint64_t affinity =
bi_make_affinity(preload_live, count, split_file) >> offset;
bi_make_affinity(preload_live, count, split_file, arch) >> offset;
/* Valhall needs >= 64-bit staging writes to be pair-aligned */
if (aligned_sr && (count >= 2 || offset))
affinity &= EVEN_BITS_MASK;
@ -381,8 +382,8 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
bi_foreach_ssa_src(ins, s) {
if (bi_count_read_registers(ins, s) >= 2)
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
else if (s < valhall_opcodes[ins->op].nr_srcs &&
va_src_info(ins->op, s).size > VA_SIZE_32)
else if (s < get_valhall_opcode(ins->op, arch).nr_srcs &&
va_src_info(ins->op, s, arch).size > VA_SIZE_32)
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
}
}
@ -435,7 +436,8 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
uint64_t default_affinity =
ctx->inputs->is_blend ? BITFIELD64_MASK(16)
: full_regs ? BITFIELD64_MASK(64)
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
: (ctx->arch >= 15) ? BITFIELD64_MASK(32)
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
/* To test spilling, mimic a small register file */
if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend && (bifrost_debug & BIFROST_DBG_NOSSARA))

View file

@ -703,8 +703,10 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
assert(intr->intrinsic == nir_intrinsic_load_var_buf_pan ||
intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan);
const unsigned arch = b->shader->arch;
/* These are only available on Valhall+ */
assert(b->shader->arch >= 9);
assert(arch >= 9);
const bool flat = intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan;
const nir_alu_type src_type = nir_intrinsic_src_type(intr);
@ -757,19 +759,36 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
bool use_imm_form = false;
if (nir_src_is_const(intr->src[0])) {
imm_offset = nir_src_as_uint(intr->src[0]);
assert(imm_offset < pan_ld_var_buf_off_size(b->shader->arch));
assert(imm_offset < pan_ld_var_buf_off_size(arch));
use_imm_form = true;
}
/* On v14+, flat source formats are removed from LD_VAR_BUF/LD_VAR_BUF_IMM,
* so flat buffer varyings must use the dedicated LD_VAR_BUF_FLAT*.
*/
if (use_imm_form) {
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
if (arch >= 14 && flat) {
bi_ld_var_buf_flat_imm_to(b, dest, regfmt, vecsize, imm_offset);
} else {
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
BI_UPDATE_STORE, vecsize, imm_offset);
}
} else {
bi_index offset = bi_src_index(&intr->src[0]);
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
source_format, BI_UPDATE_STORE, vecsize);
if (arch >= 14 && flat) {
bi_ld_var_buf_flat_to(b, dest, offset, regfmt, vecsize);
} else {
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
source_format, BI_UPDATE_STORE, vecsize);
}
}
/* LD_VAR_BUF_FLAT* only support register formats F16 and F32. */
assert(
arch < 14 || !flat ||
(regfmt == BI_REGISTER_FORMAT_F16 || regfmt == BI_REGISTER_FORMAT_F32));
bi_split_def(b, &intr->def);
}
@ -4146,13 +4165,13 @@ va_count_stats(bi_context *ctx, unsigned nr_ins, unsigned size,
}
static unsigned
va_gather_stats_block(bi_block *block, struct va_stats *counts)
va_gather_stats_block(bi_block *block, unsigned arch, struct va_stats *counts)
{
unsigned nr_ins = 0;
bi_foreach_instr_in_block(block, I) {
nr_ins++;
va_count_instr_stats(I, counts);
va_count_instr_stats(I, arch, counts);
}
return nr_ins;
}
@ -4161,7 +4180,8 @@ va_gather_stats_block(bi_block *block, struct va_stats *counts)
* Gather stats for a minimum length path through the shader.
*/
static unsigned
va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
va_gather_min_path_stats(bi_block *block, unsigned arch,
struct va_stats *counts)
{
struct va_stats min_counts;
struct va_stats save_counts = *counts;
@ -4173,7 +4193,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
if (bi_block_dominates(next, block)) {
continue;
}
nr_ins = va_gather_min_path_stats(next, counts);
nr_ins = va_gather_min_path_stats(next, arch, counts);
if (min_ins == 0 || nr_ins < min_ins) {
min_ins = nr_ins;
min_counts = *counts;
@ -4183,7 +4203,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
if (min_ins != 0) {
*counts = min_counts;
}
nr_ins = min_ins + va_gather_stats_block(block, counts);
nr_ins = min_ins + va_gather_stats_block(block, arch, counts);
return nr_ins;
}
@ -4194,7 +4214,8 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
* bail out.
*/
static unsigned
va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *visited)
va_gather_max_path_stats(bi_block *block, unsigned arch,
struct va_stats *counts, BITSET_WORD *visited)
{
struct va_stats max_counts;
struct va_stats save_counts = *counts;
@ -4207,7 +4228,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
if (BITSET_TEST(visited, next->index)) {
continue;
}
nr_ins = va_gather_max_path_stats(next, counts, visited);
nr_ins = va_gather_max_path_stats(next, arch, counts, visited);
if (nr_ins > max_ins) {
max_ins = nr_ins;
max_counts = *counts;
@ -4217,7 +4238,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
if (max_ins != 0) {
*counts = max_counts;
}
nr_ins = max_ins + va_gather_stats_block(block, counts);
nr_ins = max_ins + va_gather_stats_block(block, arch, counts);
return nr_ins;
}
@ -4241,15 +4262,16 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out,
case GATHER_STATS_FULL:
bi_foreach_instr_global(ctx, I) {
nr_ins++;
va_count_instr_stats(I, &counts);
va_count_instr_stats(I, ctx->arch, &counts);
}
break;
case GATHER_STATS_MIN:
nr_ins = va_gather_min_path_stats(first_block, &counts);
nr_ins = va_gather_min_path_stats(first_block, ctx->arch, &counts);
break;
case GATHER_STATS_MAX:
visited = BITSET_RZALLOC(NULL, ctx->num_blocks);
nr_ins = va_gather_max_path_stats(first_block, &counts, visited);
nr_ins =
va_gather_max_path_stats(first_block, ctx->arch, &counts, visited);
ralloc_free(visited);
break;
}
@ -4509,7 +4531,7 @@ bi_compile_variant_nir(nir_shader *nir,
va_lower_constants(ctx, I, const_hist, min_count_for_fau);
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
va_repair_fau(&b, I);
va_repair_fau(&b, I, ctx->arch);
}
_mesa_hash_table_u64_destroy(const_hist);
@ -4611,7 +4633,7 @@ bi_compile_variant_nir(nir_shader *nir,
bifrost_debug & BIFROST_DBG_VERBOSE);
} else {
disassemble_valhall(stderr, binary->data + offset,
binary->size - offset,
binary->size - offset, ctx->arch,
bifrost_debug & BIFROST_DBG_VERBOSE);
}
@ -4679,7 +4701,7 @@ bi_compile_variant(nir_shader *nir,
uint64_t preload = first_block->reg_live_in;
/* If multisampling is used with a blend shader, the blend shader needs
* to access the sample coverage mask in r60 and the sample ID in r61.
* to access the sample coverage mask and the sample ID.
* Blend shaders run in the same context as fragment shaders, so if a
* blend shader could run, we need to preload these registers
* conservatively. There is believed to be little cost to doing so, so
@ -4690,7 +4712,10 @@ bi_compile_variant(nir_shader *nir,
* driver. We could unify the paths if the cost is acceptable.
*/
if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
preload |=
BITFIELD64_BIT(
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch)) |
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, ctx->arch));
info->ubo_mask |= ctx->ubo_mask;
info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);

View file

@ -48,7 +48,8 @@ disassemble(const char *filename)
}
if (pan_arch(gpu_id) >= 9)
disassemble_valhall(stdout, entrypoint, filesize, verbose);
disassemble_valhall(stdout, entrypoint, filesize, pan_arch(gpu_id),
verbose);
else
disassemble_bifrost(stdout, entrypoint, filesize, verbose);

View file

@ -1162,25 +1162,25 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
/* Compute */
case BI_PRELOAD_LOCAL_ID_0:
/* Bits [15;0] */
return 55;
return (arch >= 15) ? 4 : 55;
case BI_PRELOAD_LOCAL_ID_1:
/* Bits [31;16] */
return 55;
return (arch >= 15) ? 4 : 55;
case BI_PRELOAD_LOCAL_ID_2:
/* Bits [15;0] */
return 56;
return (arch >= 15) ? 3 : 56;
case BI_PRELOAD_WORKGROUP_ID_0:
return 57;
return (arch >= 15) ? 5 : 57;
case BI_PRELOAD_WORKGROUP_ID_1:
return 58;
return (arch >= 15) ? 6 : 58;
case BI_PRELOAD_WORKGROUP_ID_2:
return 59;
return (arch >= 15) ? 7 : 59;
case BI_PRELOAD_GLOBAL_ID_0:
return 60;
return (arch >= 15) ? 0 : 60;
case BI_PRELOAD_GLOBAL_ID_1:
return 61;
return (arch >= 15) ? 1 : 61;
case BI_PRELOAD_GLOBAL_ID_2:
return 62;
return (arch >= 15) ? 2 : 62;
/* Vertex */
case BI_PRELOAD_POS_RESULT_PTR_LO:
assert(arch < 9);
@ -1190,58 +1190,58 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
return 59;
case BI_PRELOAD_INTERNAL_ID:
assert(arch >= 9);
return 59;
return (arch >= 15) ? 2 : 59;
case BI_PRELOAD_VERTEX_ID:
return (arch >= 9) ? 60 : 61;
return (arch >= 15) ? 0 : (arch >= 9) ? 60 : 61;
case BI_PRELOAD_INSTANCE_ID:
return (arch >= 9) ? 61 : 62;
return (arch >= 15) ? 1 : (arch >= 9) ? 61 : 62;
case BI_PRELOAD_DRAW_ID:
assert(arch >= 9);
return 62;
return (arch >= 15) ? 3 : 62;
case BI_PRELOAD_VIEW_ID:
assert(arch >= 9);
return 63;
return (arch >= 15) ? 4 : 63;
/* Fragment */
case BI_PRELOAD_PRIMITIVE_ID:
return 57;
return (arch >= 15) ? 6 : 57;
case BI_PRELOAD_PRIMITIVE_FLAGS:
return 58;
return (arch >= 15) ? 3 : 58;
case BI_PRELOAD_POSITION_XY:
return 59;
return (arch >= 15) ? 2 : 59;
case BI_PRELOAD_CUMULATIVE_COVERAGE:
/* Bits [15;0] */
return 60;
return (arch >= 15) ? 0 : 60;
case BI_PRELOAD_RASTERIZER_COVERAGE:
/* Bits [15;0] */
return 61;
return (arch >= 15) ? 1 : 61;
case BI_PRELOAD_SAMPLE_ID:
/* Bits [23;16] */
return 61;
return (arch >= 15) ? 0 : 61;
case BI_PRELOAD_CENTROID_ID:
/* Bits [31;24] */
return 61;
return (arch >= 15) ? 0 : 61;
case BI_PRELOAD_FRAME_ARG:
/* Double reg */
return 62;
return (arch >= 15) ? 4 : 62;
/* Blend */
case BI_PRELOAD_BLEND_SRC0_C0:
return 0;
return (arch >= 15) ? 8 : 0;
case BI_PRELOAD_BLEND_SRC0_C1:
return 1;
return (arch >= 15) ? 9 : 1;
case BI_PRELOAD_BLEND_SRC0_C2:
return 2;
return (arch >= 15) ? 10 : 2;
case BI_PRELOAD_BLEND_SRC0_C3:
return 3;
return (arch >= 15) ? 11 : 3;
case BI_PRELOAD_BLEND_SRC1_C0:
return 4;
return (arch >= 15) ? 12 : 4;
case BI_PRELOAD_BLEND_SRC1_C1:
return 5;
return (arch >= 15) ? 13 : 5;
case BI_PRELOAD_BLEND_SRC1_C2:
return 6;
return (arch >= 15) ? 14 : 6;
case BI_PRELOAD_BLEND_SRC1_C3:
return 7;
return (arch >= 15) ? 15 : 7;
case BI_PRELOAD_BLEND_LINK:
return 48;
return (arch >= 15) ? 7 : 48;
}
UNREACHABLE("Non-handled BI_PRELOAD");
}
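
For illustration (values read straight off the table above): the same logical preload now lands in different hardware registers per architecture; v15 moves most preloads out of the r48..r63 window into low registers and shifts the blend source colors up to r8..r15 to make room.

    #include <assert.h>

    static void check_preload_remap(void)
    {
       assert(bi_preload_reg(BI_PRELOAD_VERTEX_ID, 9) == 60);
       assert(bi_preload_reg(BI_PRELOAD_VERTEX_ID, 15) == 0);
       assert(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, 9) == 61);
       assert(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, 15) == 0);
       assert(bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, 9) == 0);
       assert(bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, 15) == 8);
    }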

File diff suppressed because it is too large

View file

@ -29,16 +29,20 @@ class FAUState:
die_if(self.page is not None and self.page != page, 'Mismatched pages')
self.page = page
def push(self, source):
if not (source & (1 << 7)):
# Skip registers
def push(self, source, arch):
# Skip registers
if arch >= 15 and not (source & (1 << 8)):
return
elif arch < 15 and not (source & (1 << 7)):
return
self.buffer.add(source)
die_if(len(self.buffer) > 2, "Overflowed FAU buffer")
if (source >> 5) == 0b110:
# Small constants need to check if the buffer overflows, but nothing else
# Small constants need to check if the buffer overflows, but nothing else
if arch >= 15 and (source >> 5) == 0b1110:
return
elif arch < 15 and (source >> 5) == 0b110:
return
slot = (source >> 1)
@ -120,6 +124,50 @@ def encode_source(op, fau):
die('Invalid operand')
def encode_source_v15(op, fau):
# Reg tuple
if op[0] == '[' and op[-1:] == ']':
# Remove brackets and split on ":"
unpacked = op[1:-1].split(":")
die_if(len(unpacked) != 2, 'Invalid tuple')
die_if(unpacked[0][0] != 'r', 'Invalid tuple')
die_if(unpacked[1][0] != 'r', 'Invalid tuple')
if (unpacked[0][-1:] == '^'):
val0 = parse_int(unpacked[0][1:-1], 0, 127)
val1 = parse_int(unpacked[1][1:-1], 0, 127)
die_if(val1 != val0 + 1, 'Invalid tuple value')
return val0 | 0x80
else:
val0 = parse_int(unpacked[0][1:], 0, 127)
val1 = parse_int(unpacked[1][1:], 0, 127)
die_if(val1 != val0 + 1, 'Invalid tuple value')
return val0
elif op[0] == 'r':
if (op[-1:] == '^'):
return parse_int(op[1:-1], 0, 127) | 0x80
return parse_int(op[1:], 0, 127)
elif op[0] == 'u':
val = parse_int(op[1:], 0, 254)
fau.set_page(val >> 6)
return ((val & 0x3F) << 1) | 0x100
elif op[0] == 'i':
return int(op[3:]) | 0x1C0
elif op.startswith('0x'):
try:
val = int(op, base=0)
except ValueError:
die('Expected value')
die_if(val not in immediates, 'Unexpected immediate value')
return immediates.index(val) | 0x1C0
else:
for i in [0, 1, 3]:
if op in enums[f'fau_special_page_{i}'].bare_values:
idx = 32 + (enums[f'fau_special_page_{i}'].bare_values.index(op) << 1)
fau.set_page(i)
return idx | 0x1E0
die('Invalid operand')
def encode_dest(op):
# Reg tuple
@ -156,7 +204,47 @@ def encode_dest(op):
return value | (wrmask << 6)
def parse_asm(line):
def encode_dest_v15(op, dst64):
# Reg tuple
if op[0] == '[' and op[-1:] == ']':
# Remove brackets and split on ":"
unpacked = op[1:-1].split(":")
die_if(len(unpacked) != 2, 'Invalid tuple')
die_if(unpacked[0][0] != 'r', 'Invalid tuple')
die_if(unpacked[1][0] != 'r', 'Invalid tuple')
parts = unpacked[0].split(".")
reg = parts[0]
value = parse_int(reg[1:], 0, 127)
parts1 = unpacked[1].split(".")
reg1 = parts1[0]
val1 = parse_int(reg1[1:], 0, 127)
die_if(val1 != value + 1, 'Invalid tuple value')
else:
die_if(op[0] != 'r', f"Expected register destination {op}")
parts = op.split(".")
reg = parts[0]
value = parse_int(reg[1:], 0, 127)
# Default to writing in full
if (dst64):
wrmask = 0x0
die_if(len(parts) > 1, "Must write full")
else:
wrmask = 0x3
if len(parts) > 1:
WMASKS = ["h0", "h1"]
die_if(len(parts) > 2, "Too many modifiers")
mask = parts[1]
die_if(mask not in WMASKS, "Expected a write mask")
wrmask = 1 << WMASKS.index(mask)
return value | (wrmask << 13)
def parse_asm(line, arch):
global LINE
LINE = line # For better errors
encoded = 0
@ -187,7 +275,7 @@ def parse_asm(line):
tail = line[(len(head) + 1):]
operands = [x.strip() for x in tail.split(",") if len(x.strip()) > 0]
expected_op_count = len(ins.srcs) + len(ins.dests) + len(ins.immediates) + len(ins.staging)
expected_op_count = len(ins.srcs) + len(ins.dests) + len((ins.immediates_v15 if arch >= 15 else ins.immediates)) + len(ins.staging)
if len(operands) != expected_op_count:
die(f"Wrong number of operands in {line}, expected {expected_op_count}, got {len(operands)} {operands}")
@ -200,9 +288,9 @@ def parse_asm(line):
parts = []
die_if(any([x[0] != 'r' for x in parts]), f'Expected registers, got {op}')
regs = [parse_int(x[1:], 0, 63) for x in parts]
regs = [parse_int(x[1:], 0, (127 if arch >= 15 else 63)) for x in parts]
extended_write = "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write
extended_write = "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write
max_sr_count = 8 if extended_write else 7
sr_count = len(regs)
@ -215,22 +303,31 @@ def parse_asm(line):
'Consecutive staging registers must be aligned to a register pair')
if sr.count == 0:
if "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write:
if "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write:
modifier_map["staging_register_write_count"] = sr_count - 1
else:
assert "staging_register_count" in [x.name for x in ins.modifiers]
assert "staging_register_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)]
modifier_map["staging_register_count"] = sr_count
else:
die_if(sr_count != sr.count, f"Expected {sr.count} staging registers, got {sr_count}")
encoded |= ((sr.encoded_flags | base) << sr.start)
encoded |= base << sr.start
if arch >= 15:
encoded |= sr.encoded_flags_v15 << sr.offset['flags_v15']
else:
encoded |= sr.encoded_flags << sr.offset['flags']
# On v15, some instructions require special sr_control values
if arch >= 15 and ins.name == "BARRIER":
encoded |= 0b10 << 38
operands = operands[len(ins.staging):]
for op, dest in zip(operands, ins.dests):
encoded |= encode_dest(op) << 40
encoded |= (encode_dest_v15(op, dest.size >= 64) if arch >= 15 else encode_dest(op)) << 40
operands = operands[len(ins.dests):]
if len(ins.dests) == 0 and len(ins.staging) == 0:
if arch < 15 and len(ins.dests) == 0 and len(ins.staging) == 0:
# Set a placeholder writemask to prevent encoding faults
encoded |= (0xC0 << 40)
@ -238,12 +335,18 @@ def parse_asm(line):
for i, (op, src) in enumerate(zip(operands, ins.srcs)):
parts = op.split('.')
encoded_src = encode_source(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 6) == 0b10)
if (arch >= 15):
encoded_src = encode_source_v15(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b1111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 7) == 0b10)
else:
encoded_src = encode_source(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 6) == 0b10)
# Has a swizzle been applied yet?
swizzled = False
@ -251,7 +354,11 @@ def parse_asm(line):
for mod in parts[1:]:
# Encode the modifier
if mod in src.offset and src.mask[mod] == 0x1:
encoded |= (1 << src.offset[mod])
# On v15, FMA_RSCALE has a different offset src2.neg
if arch >= 15 and ins.name[:10] == "FMA_RSCALE" and mod == "neg" and i == 2:
encoded |= (1 << (src.offset[mod] + 1))
else:
encoded |= (1 << src.offset[mod])
elif src.halfswizzle and mod in enums[f'half_swizzles_{src.size}_bit'].bare_values:
die_if(swizzled, "Multiple swizzles specified")
swizzled = True
@ -318,12 +425,15 @@ def parse_asm(line):
val = enums['swizzles_16_bit'].bare_values.index(mod)
encoded |= (val << src.offset['widen'])
encoded |= encoded_src << src.start
fau.push(encoded_src)
if arch >= 15:
encoded |= ((encoded_src & 0x100) << (src.offset['high1_v15'] - 8)) | ((encoded_src & 0xFF) << src.start)
else:
encoded |= encoded_src << src.start
fau.push(encoded_src, arch)
operands = operands[len(ins.srcs):]
for i, (op, imm) in enumerate(zip(operands, ins.immediates)):
for i, (op, imm) in enumerate(zip(operands, (ins.immediates_v15 if arch >= 15 else ins.immediates))):
if op[0] == '#':
die_if(imm.name != 'constant', "Wrong syntax for immediate")
parts = [imm.name, op[1:]]
@ -347,15 +457,15 @@ def parse_asm(line):
encoded |= (val << imm.start)
operands = operands[len(ins.immediates):]
operands = operands[len((ins.immediates_v15 if arch >= 15 else ins.immediates)):]
# Encode the operation itself
for subcode in ins.opcode:
for subcode in (ins.opcode_v15 if arch >= 15 else ins.opcode):
encoded |= (subcode.value << subcode.start)
# Encode FAU page
if fau.page:
encoded |= (fau.page << ins.offset['fau_page'])
encoded |= (fau.page << (ins.offset['fau_page_v15'] if arch >= 15 else ins.offset['fau_page']))
# Encode modifiers
has_flow = False
@ -366,9 +476,10 @@ def parse_asm(line):
if mod in enums['flow'].bare_values:
die_if(has_flow, "Multiple flow control modifiers specified")
has_flow = True
encoded |= (enums['flow'].bare_values.index(mod) << ins.offset['flow'])
encoded |= (enums['flow'].bare_values.index(mod) << (ins.offset['flow_v15'] if arch >= 15 else
ins.offset['flow']))
else:
candidates = [c for c in ins.modifiers if mod in c.bare_values]
candidates = [c for c in (ins.modifiers_v15 if arch >= 15 else ins.modifiers) if mod in c.bare_values]
die_if(len(candidates) == 0, f"Invalid modifier {mod} used")
assert(len(candidates) == 1) # No ambiguous modifiers
@ -380,13 +491,20 @@ def parse_asm(line):
die_if(opts.name in modifier_map, f"{opts.name} specified twice")
modifier_map[opts.name] = value
for mod in ins.modifiers:
for mod in (ins.modifiers_v15 if arch >= 15 else ins.modifiers):
value = modifier_map.get(mod.name, mod.default)
die_if(value is None, f"Missing required modifier {mod.name}")
assert(value < (1 << mod.size))
encoded |= (value << mod.start)
# On v15, some instructions require an encoded null src.
requires_nullsrc = ['BARRIER', 'NOP', 'LD_GCLK_U64', 'LD_VAR_FLAT_IMM', 'LD_VAR_BUF_FLAT_IMM']
if arch >= 15 and ins.name in requires_nullsrc:
enc_src = 0x1C0
encoded |= ((enc_src >> 8) & 0x1) << 48 | (enc_src & 0xFF)
return encoded
if __name__ == "__main__":

View file

@ -28,6 +28,10 @@ template = """
#define VA_SRC_UNIFORM_TYPE 0x2
#define VA_SRC_IMM_TYPE 0x3
#define VA_SRC_V15_MODE1 BIT(8)
#define VA_SRC_V15_MODE2 BIT(7)
#define VA_SRC_V15_MODE4 BIT(5)
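
A standalone sketch (not from the diff) of the 9-bit source decode these mode bits imply, mirroring va_print_src_v15 below; it also matches what encode_source_v15 in the assembler emits (0x100 | ... for uniforms, 0x1C0 | ... for immediates, 0x1E0 | ... for FAU specials). FAU page handling and name lookup are omitted.

    #include <stdio.h>

    #define BIT(n) (1u << (n))
    #define MASK(n) (BIT(n) - 1)

    static void decode_v15_src(unsigned src) /* src is the 9-bit operand */
    {
       if (!(src & BIT(8)))         /* VA_SRC_V15_MODE1 clear: register */
          printf("r%u%s\n", src & MASK(7), (src & BIT(7)) ? "^" : "");
       else if (!(src & BIT(7)))    /* VA_SRC_V15_MODE2 clear: uniform */
          printf("u%u.w%u\n", (src & MASK(7)) >> 1, src & 1);
       else if (src & BIT(5))       /* VA_SRC_V15_MODE4 set: FAU special */
          printf("fau_special[%u].w%u\n", (src & MASK(5)) >> 1, src & 1);
       else                         /* immediate LUT entry */
          printf("imm[%u]\n", src & MASK(5));
    }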
% for name, en in ENUMS.items():
UNUSED static const char *valhall_${name}[] = {
% for v in en.values:
@ -91,22 +95,84 @@ va_print_float_src(FILE *fp, unsigned type, unsigned value, unsigned size, unsig
fprintf(fp, ".abs");
}
static inline void
va_print_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page)
{
unsigned src = (high1 << 8) | low8;
/* Not reg */
if (src & VA_SRC_V15_MODE1) {
/* Not uniform */
if (src & VA_SRC_V15_MODE2) {
/* FAU special */
if (src & VA_SRC_V15_MODE4) {
unsigned value = src & MASK(5);
if (fau_page == 0)
fputs(valhall_fau_special_page_0[value >> 1] + 1, fp);
else if (fau_page == 1)
fputs(valhall_fau_special_page_1[value >> 1] + 1, fp);
else if (fau_page == 3)
fputs(valhall_fau_special_page_3[value >> 1] + 1, fp);
else
fprintf(fp, "reserved_page2");
fprintf(fp, ".w%u", value & 1);
}
/* Imm */
else {
unsigned value = src & MASK(5);
assert(value < 32 && "overflow in LUT");
fprintf(fp, "0x%X", va_immediates[value]);
}
}
/* Uniform */
else {
unsigned value = src & MASK(7);
fprintf(fp, "u%u", value >> 1 | (fau_page << 6));
if (size <= 32)
fprintf(fp, ".w%u", value & 1);
}
}
/* Reg */
else {
unsigned value = src & MASK(7);
bool discard = (src & BIT(7));
char *dmark = discard ? "^" : "";
if (size > 32)
fprintf(fp, "[r%u%s:r%u%s]", value, dmark, value + 1, dmark);
else
fprintf(fp, "r%u%s", value, dmark);
}
}
static inline void
va_print_float_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page, bool neg, bool abs)
{
va_print_src_v15(fp, high1, low8, size, fau_page);
if (neg)
fprintf(fp, ".neg");
if (abs)
fprintf(fp, ".abs");
}
static inline void
va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
{
if (size > 32)
fprintf(fp, "[r%u:r%u]", value, value + 1);
else
else {
fprintf(fp, "r%u", value);
if (mask != 0x3)
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
if (mask != 0x3)
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
}
}
<%def name="print_instr(op)">
<%def name="print_instr(op, v15)">
<% no_comma = True %>
fputs("${op.name}", fp);
% for mod in op.modifiers:
% for mod in (op.modifiers_v15 if v15 else op.modifiers):
% if mod.name not in ["staging_register_count", "staging_register_write_count"]:
% if mod.is_enum:
fputs(valhall_${safe_name(mod.enum)}[(instr >> ${mod.start}) & ${hex((1 << mod.size) - 1)}], fp);
@ -115,10 +181,18 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
% endif
% endif
% endfor
% if v15:
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow_v15']}) & ${hex(op.mask['flow_v15'])}]);
% else:
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow']}) & ${hex(op.mask['flow'])}]);
% endif
% for i, dest in enumerate(op.dests):
<% no_comma = False %>
% if v15:
va_print_dest(fp, (instr >> ${dest.offset['mode_v15']}) & ${hex(dest.mask['mode_v15'])}, (instr >> ${dest.offset['value_v15']}) & ${hex(dest.mask['value_v15'])}, ${dest.size});
% else:
va_print_dest(fp, (instr >> ${dest.offset['mode']}) & ${hex(dest.mask['mode'])}, (instr >> ${dest.offset['value']}) & ${hex(dest.mask['value'])}, ${dest.size});
% endif
% endfor
% for index, sr in enumerate(op.staging):
% if not no_comma:
@ -130,13 +204,12 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
if sr.count != 0:
sr_count = sr.count;
else:
for mod in op.modifiers:
for mod in (op.modifiers_v15 if v15 else op.modifiers):
if mod.name == "staging_register_write_count" and sr.write:
sr_count = f"(((instr >> {mod.start}) & {hex((1 << mod.size) - 1)}) + 1)";
elif mod.name == "staging_register_count":
sr_count = f"((instr >> {mod.start}) & {hex((1 << mod.size) - 1)})";
%>
// assert(((instr >> ${sr.start}) & 0xC0) == ${sr.encoded_flags});
fprintf(fp, "@");
for (unsigned i = 0; i < ${sr_count}; ++i) {
fprintf(fp, "%sr%u", (i == 0) ? "" : ":",
@ -148,6 +221,28 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
fputs(", ", fp);
% endif
<% no_comma = False %>
% if v15:
% if src.absneg:
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${hex(src.mask['high1_v15'])}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])},
% if op.name[:4] == "FMA." and i == 0:
false,
instr & BIT(${src.offset['abs']}));
% elif op.name[:10] == "FMA_RSCALE" and i == 2:
instr & BIT(${src.offset['neg'] + 1}),
false);
% else:
instr & BIT(${src.offset['neg']}),
instr & BIT(${src.offset['abs']}));
% endif
% elif src.is_float:
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}, false, false);
% else:
va_print_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])});
% endif
% else:
% if src.absneg:
va_print_float_src(fp, (instr >> ${src.offset['mode']}) & ${hex(src.mask['mode'])}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])},
@ -160,6 +255,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
va_print_src(fp, (instr >> ${src.offset['mode']}) & ${src.mask['mode']}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])});
% endif
% endif
% if src.swizzle:
% if src.size == 32:
fputs(valhall_widen[(instr >> ${src.offset['swizzle']}) & ${hex(src.mask['swizzle'])}], fp);
@ -183,7 +279,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
if (instr & BIT(${src.offset['not']})) fputs(".not", fp);
% endif
% endfor
% for imm in op.immediates:
% for imm in (op.immediates_v15 if v15 else op.immediates):
<%
prefix = "#" if imm.name == "constant" else imm.name + ":"
fmt = "%d" if imm.signed else "0x%X"
@ -192,16 +288,16 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
% endfor
</%def>
<%def name="recurse_subcodes(op_bucket)">
<%def name="recurse_subcodes(op_bucket, v15)">
%if op_bucket.instr:
${print_instr(op_bucket.instr)}
${print_instr(op_bucket.instr, v15)}
%else:
opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)};
switch (opcode) {
%for op in op_bucket.children:
case ${hex(op)}:
{
${recurse_subcodes(op_bucket.children[op])}
${recurse_subcodes(op_bucket.children[op], v15)}
break;
}
%endfor
@ -215,7 +311,15 @@ va_disasm_instr(FILE *fp, uint64_t instr)
{
unsigned opcode;
${recurse_subcodes(OPCODES)}
${recurse_subcodes(OPCODES, False)}
}
void
va_disasm_instr_v15(FILE *fp, uint64_t instr)
{
unsigned opcode;
${recurse_subcodes(OPCODES_V15, True)}
}
static bool is_branch(uint64_t instr)
@ -229,8 +333,19 @@ static bool is_branch(uint64_t instr)
return false;
}
static bool is_branch_v15(uint64_t instr)
{
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZ") %>
if ((instr & ${hex(mask)}) == ${hex(exact)})
return true;
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZI") %>
if ((instr & ${hex(mask)}) == ${hex(exact)})
return true;
return false;
}
void
disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch, bool verbose)
{
assert((size & 7) == 0);
@ -256,11 +371,18 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
fprintf(fp, " ");
}
va_disasm_instr(fp, instr);
bool instr_is_branch;
if (arch >= 15) {
va_disasm_instr_v15(fp, instr);
instr_is_branch = is_branch_v15(instr);
} else {
va_disasm_instr(fp, instr);
instr_is_branch = is_branch(instr);
}
fprintf(fp, "\\n");
/* Separate blocks visually by inserting whitespace after branches */
if (is_branch(instr))
if (instr_is_branch)
fprintf(fp, "\\n");
}
@ -276,6 +398,9 @@ class OpBucket:
self.children = {}
def insert(self, subcodes, ins):
# Need an early return in case of removed instructions
if subcodes is None:
return
if len(subcodes) == 0:
self.instr = ins
else:
@ -305,10 +430,12 @@ class OpBucket:
# Build opcode hierarchy:
OPCODES = OpBucket()
OPCODES_V15 = OpBucket()
for ins in instructions:
OPCODES.insert(ins.opcode, ins)
OPCODES_V15.insert(ins.opcode_v15, ins)
try:
print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
print(Template(template).render(OPCODES = OPCODES, OPCODES_V15 = OPCODES_V15, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
except:
print(exceptions.text_error_template().render())

View file

@ -15,6 +15,8 @@
#include <string.h>
void va_disasm_instr(FILE *fp, uint64_t instr);
void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose);
void va_disasm_instr_v15(FILE *fp, uint64_t instr);
void disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch,
bool verbose);
#endif

View file

@ -44,9 +44,7 @@ libpanfrost_valhall_disasm = static_library(
)
if with_tests
test(
'valhall_disasm',
executable(
valhall_disasm_test_e = executable(
'valhall_disasm_test',
files('test/test-disassembler.c'),
c_args : [c_msvc_compat_args, no_override_init_args],
@ -54,15 +52,33 @@ if with_tests
include_directories : [inc_include, inc_src],
dependencies: [idep_valhall_enums_h],
link_with : [libpanfrost_valhall_disasm],
),
)
test(
'valhall_disasm',
valhall_disasm_test_e,
suite : ['panfrost'],
args : files('test/assembler-cases.txt'),
args : [files('test/assembler-cases.txt'), 'v10'],
)
test(
'valhall_disasm',
valhall_disasm_test_e,
suite : ['panfrost'],
args : [files('test/assembler-cases-v15.txt'), 'v15'],
)
test(
'valhall_asm',
prog_python,
args : files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'),
args : [files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'), 'v10'],
suite : ['panfrost'],
)
test(
'valhall_asm',
prog_python,
args : [files('test-assembly.py', 'test/assembler-cases-v15.txt', 'test/negative-cases.txt'), 'v15'],
suite : ['panfrost'],
)
endif

View file

@ -17,19 +17,19 @@ def hex_8(u64):
return ' '.join(as_strings)
# These should not throw exceptions
def positive_test(machine, assembly):
def positive_test(machine, assembly, arch):
try:
expected = parse_hex_8(machine)
val = parse_asm(assembly)
val = parse_asm(assembly, arch)
if val != expected:
return f"{hex_8(val)} Incorrect assembly"
except ParseError as exc:
return f"Unexpected exception: {exc}"
# These should throw exceptions
def negative_test(assembly):
def negative_test(assembly, arch):
try:
parse_asm(assembly)
parse_asm(assembly, arch)
return "Expected exception"
except Exception:
return None
@ -43,24 +43,34 @@ def record_case(case, error):
else:
FAIL.append((case, error))
if len(sys.argv) < 3:
print("Expected positive and negative case lists")
if len(sys.argv) < 4:
print("Expected positive and negative case lists, followed by arch")
sys.exit(1)
if sys.argv[3][0] == 'v':
try:
arch = int(sys.argv[3][1:], base = 0)
except ValueError:
print(f"Expected arch number {sys.argv[3][1:]}")
sys.exit(1)
else:
print(f"Expected arch version {sys.argv[3]}")
sys.exit(1)
with open(sys.argv[1], "r") as f:
cases = f.read().split('\n')
cases = [x for x in cases if len(x) > 0 and x[0] != '#']
for case in cases:
(machine, assembly) = case.split(' ')
record_case(case, positive_test(machine, assembly))
record_case(case, positive_test(machine, assembly, arch))
with open(sys.argv[2], "r") as f:
cases = f.read().split('\n')
cases = [x for x in cases if len(x) > 0]
for case in cases:
record_case(case, negative_test(case))
record_case(case, negative_test(case, arch))
print("Passed {}/{} tests.".format(len(PASS), len(PASS) + len(FAIL)))

View file

@ -0,0 +1,195 @@
02 00 20 00 00 01 60 00 MOV.i32 r1, r2
0a 00 20 00 00 01 61 00 MOV.i32 r1, u5.w0
e3 00 20 00 00 01 61 40 MOV.i32 r1, thread_local_pointer.w1
e6 00 20 00 00 01 61 40 MOV.i32 r1, workgroup_local_pointer.w0
e2 00 20 00 00 01 61 c0 MOV.i32 r1, lane_id.w0
e6 00 20 00 00 01 61 c0 MOV.i32 r1, core_id.w0
01 02 00 00 00 00 f0 00 FADD.f32 r0, r1, r2
01 02 00 00 20 00 f0 00 FADD.f32 r0, r1, r2.abs
01 02 00 00 10 00 f0 00 FADD.f32 r0, r1, r2.neg
01 02 00 00 30 00 f0 00 FADD.f32 r0, r1, r2.neg.abs
01 02 00 80 30 00 f0 00 FADD.f32.clamp_m1_1 r0, r1, r2.neg.abs
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
01 d0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x3F800000
01 d0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x3F800000.neg
01 c0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x0
01 c0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x0.neg
01 c9 00 00 00 00 e2 00 IADD.u32 r0, r1, 0x7060504
01 00 00 08 00 00 f0 00 FADD.f32 r0, r1, r0.h1
01 00 00 04 00 00 f0 00 FADD.f32 r0, r1, r0.h0
01 00 00 0c 00 00 f4 00 FADD.v2f16 r0, r1.h00, r0.h11
01 00 00 28 00 00 f4 00 FADD.v2f16 r0, r1, r0
01 00 00 24 00 00 f4 00 FADD.v2f16 r0, r1, r0.h10
01 02 00 08 00 00 e0 00 IADD.u32 r0, r1, r2.h0
01 02 00 0c 00 00 e0 00 IADD.u32 r0, r1, r2.h1
01 02 00 0c 70 00 e0 00 IADD.u32 r0, r1.b3, r2.h1
01 c9 00 18 00 00 e2 00 IADD.u32 r0, r1, 0x7060504.b2
01 02 00 08 20 00 e4 00 IADD.v2u16 r0, r1, r2
02 3c 47 20 00 00 91 02 SHADDX.u64 [r0:r1], u1, [r60:r61].w0, shift:0x2
80 00 00 00 19 00 20 07 LOAD.i32.slot0.wait0 @r0, [r0^:r1^], offset:0
00 bc 87 20 00 00 91 02 SHADDX.u64 [r0:r1], u0, [r60^:r61^].w0, shift:0x4
80 00 00 00 9c 04 20 3f STORE.i128.slot0.end @r4:r5:r6:r7, [r0^:r1^], offset:0
c0 00 e0 01 00 00 a1 3e NOP.end
80 c4 c0 1e 02 01 e6 01 ICMP_OR.u32.gt.m1 r1, r0^, 0x1000000.b3, 0x0
82 00 00 00 99 00 20 2b STORE.i32.slot0.reconverge @r0, [r2^:r3^], offset:0
00 c9 8f 12 30 00 e2 00 CLPER.i32.f1 r0, r0, 0x7060504.b00
00 00 4b 00 00 02 60 00 F16_TO_F32 r2, r0.h0
80 00 4b 10 00 03 60 00 F16_TO_F32 r3, r0^.h1
c0 00 e0 01 00 00 a1 22 NOP.wait0126
80 c0 00 28 90 00 f6 24 FADD.v2f16.wait r0, r0^.abs, 0x0.neg
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
3c d0 ea 00 01 3c d6 37 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0
80 db 05 04 00 01 e6 00 MKVEC.v2i16 r1, r0^.h0, 0x3C000000.h1
f0 00 3c 33 82 00 1b 3f BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0
bb 0d 00 40 02 04 08 07 LEA_BUF_IMM.slot1.wait0 @r4:r5, r59^, table:0xD, index:0x0
00 dd c0 08 14 02 66 01 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg
81 08 c0 00 04 01 66 01 FMA.f32 r1, r1^, u4.w0, 0x0.neg
80 08 c0 00 04 00 66 09 FMA.f32.wait1 r0, r0^, u4.w0, 0x0.neg
84 00 00 02 93 00 20 3f STORE.i96.estream.slot0.end @r0:r1:r2, [r4^:r5^], offset:0
84 00 00 01 9c 08 20 3f STORE.i128.istream.slot0.end @r8:r9:r10:r11, [r4^:r5^], offset:0
c0 00 00 c0 80 00 3d 27 BARRIER.slot7.wait
00 00 00 00 01 02 21 03 LOAD.i8.slot0 @r2, u0, offset:0
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
00 00 00 00 11 02 21 03 LOAD.i24.slot0 @r2, u0, offset:0
00 00 00 00 19 02 21 03 LOAD.i32.slot0 @r2, u0, offset:0
00 00 00 00 02 02 21 03 LOAD.i48.slot0 @r2:r3, u0, offset:0
00 00 00 00 0a 02 21 03 LOAD.i64.slot0 @r2:r3, u0, offset:0
00 00 00 00 13 02 21 03 LOAD.i96.slot0 @r2:r3:r4, u0, offset:0
00 00 00 00 1c 04 21 03 LOAD.i128.slot0 @r4:r5:r6:r7, u0, offset:0
00 00 00 08 01 02 21 03 LOAD.i8.b1.slot0 @r2, u0, offset:0
00 00 00 10 01 02 21 03 LOAD.i8.b2.slot0 @r2, u0, offset:0
00 00 00 18 01 02 21 03 LOAD.i8.b3.slot0 @r2, u0, offset:0
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
00 14 00 08 09 02 21 03 LOAD.i16.h1.slot0 @r2, u0, offset:20
82 00 4d 00 42 02 60 00 FROUND.f32.rtn r2, r2^.neg
82 00 4b 00 40 02 60 00 F16_TO_F32 r2, r2^.neg.h0
82 00 4c 00 43 02 60 00 F32_TO_S32.rtz r2, r2^.neg
82 c0 c6 47 48 02 64 00 FADD_IMM.f32 r2, r2^, #0x4847C6C0
82 84 67 ac 70 02 62 00 FADD_IMM.v2f16 r2, r2^, #0x70AC6784
82 14 00 13 00 02 6a 00 IADD_IMM.v2i16 r2, r2^, #0x130014
82 ab 4b 00 00 02 6c 00 IADD_IMM.i32 r2, r2^, #0x4BAB
83 82 c0 c6 12 02 e4 01 ICMP_OR.v2s16.gt.m1 r2, r3^.h10, r2^.h10, 0x0
83 82 c0 52 03 02 e4 01 FCMP_OR.v2f16.gt.m1 r2, r3^.h10, r2^.h00, 0x0
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
00 03 00 00 20 00 b8 2a BRANCHZ.reconverge r0.h0, offset:3
00 03 00 00 40 00 b8 2a BRANCHZ.reconverge r0.h1, offset:3
00 03 00 00 00 00 b8 2a BRANCHZ.reconverge r0, offset:3
c0 00 00 00 00 00 6d 00 IADD_IMM.i32 r0, 0x0, #0x0
c0 01 00 00 00 04 6d 28 IADD_IMM.i32.reconverge r4, 0x0, #0x1
00 00 47 20 00 02 91 02 SHADDX.u64 [r2:r3], u0, [r0:r1].w0, shift:0x2
80 c9 00 10 00 00 e2 00 IADD.u32 r0, r0^, 0x7060504.b0
00 02 c0 02 06 01 e6 01 ICMP_OR.u32.ne.m1 r1, r0, u1.w0, 0x0
04 00 20 00 00 05 60 00 MOV.i32 r5, r4
04 00 20 00 00 06 60 00 MOV.i32 r6, r4
04 00 20 00 00 07 60 04 MOV.i32.wait0 r7, r4
82 00 00 00 9c 04 20 03 STORE.i128.slot0 @r4:r5:r6:r7, [r2^:r3^], offset:0
81 f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r1^, offset:-8
bd c0 00 08 10 3c c6 00 IADD.v2u16 r60.h1, r61^.h10, 0x0
84 00 86 32 8c 00 12 3f ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
84 00 86 34 8c 00 12 3f ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
84 00 86 36 8c 00 12 3f ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
bc c0 12 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x1
bc c0 02 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x0
02 01 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w0, u0.w1
00 01 00 40 0a 00 8b 03 LD_PKA.i64.slot1 @r0:r1, u0.w0, u0.w1
04 01 00 80 0a 26 8b 03 LD_PKA.i64.slot2 @r38:r39, u2.w0, u0.w1
03 01 00 80 0a 24 8b 03 LD_PKA.i64.slot2 @r36:r37, u1.w1, u0.w1
03 04 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w1, u2.w0
81 02 00 00 13 02 8a 03 LD_PKA.i96.slot0 @r2:r3:r4, r1^, u1.w0
80 03 00 00 13 06 8a 07 LD_PKA.i96.slot0.wait0 @r6:r7:r8, r0^, u1.w1
80 00 80 01 c0 00 60 20 FRCP.f32.wait0126 r0, r0^.neg.abs
80 84 00 80 00 00 7c 01 MUX.i32.neg r0, r0^, r4^, u0.w0
80 84 00 80 04 00 7c 01 MUX.i32 r0, r0^, r4^, u0.w0
80 84 00 80 08 00 7c 01 MUX.i32.fp_zero r0, r0^, r4^, u0.w0
80 84 00 80 0c 00 7c 01 MUX.i32.bit r0, r0^, r4^, u0.w0
00 00 20 41 00 01 60 34 FREXPM.f32.sqrt.discard r1, r0
01 00 82 01 00 02 60 00 FRSQ.f32 r2, r1
80 00 22 41 00 00 60 00 FREXPE.f32.sqrt r0, r0^
81 82 c0 80 0a 00 64 02 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
81 82 c0 80 0e 00 64 22 FMA_RSCALE.f32.left.wait0126 r0, r1^, r2^, 0x0.neg, r0^
82 83 04 05 00 01 7c 02 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
82 83 04 05 08 01 7c 02 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
82 83 04 05 48 01 7c 02 CSEL.s32.lt r1, r2^, r3^, u2.w0, u2.w1
3d 00 00 12 5a 02 18 07 LD_VAR_SPECIAL.v2.f32.sample.clobber.slot0.wait0 @r2:r3, r61, index:0x0
3d 00 00 3f 0a 02 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.center.retrieve.wait0 @r2:r3, r61, index:0x0
3d 00 00 3f 42 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.sample.store.wait0 @r0:r1, r61, index:0x0
3d 08 00 3f 22 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.centroid.store.wait0 @r0:r1, r61, index:0x8
bc bd 11 33 02 00 84 03 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, r60^, r61^, index:0x1, table:0x1
80 3c 03 23 02 04 c0 03 LD_TILE.v3.f16.slot0 @r4:r5, r0^, r60, r3
00 c9 00 20 10 01 c6 00 IADD.v2u16 r1.h1, r0.h10, 0x7060504.b11
80 c0 00 08 10 01 a6 00 IADD.v2u16 r1.h0, r0^.h10, 0x0
02 02 00 04 20 02 a4 00 IADD.v2u16 r2.h0, r2, r2.h10
82 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r2^.h0, 0x0.h0
b7 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r55^.h0, 0x0.h0
b7 c0 05 10 00 02 e6 00 MKVEC.v2i16 r2, r55^.h1, 0x0.h0
c0 b7 05 00 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h0
c0 b7 05 04 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h1
b7 00 54 00 00 02 60 00 U16_TO_U32 r2, r55^.h0
b7 00 54 10 00 02 60 00 U16_TO_U32 r2, r55^.h1
b7 00 44 00 00 02 60 00 S16_TO_S32 r2, r55^.h0
b7 00 44 10 00 02 60 00 S16_TO_S32 r2, r55^.h1
c0 b7 01 08 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h0
c0 b7 01 0c 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h1
00 c0 c0 c0 c0 07 7e 01 MKVEC.v2i8 r7, r0.b3, 0x0.b0, 0x0
00 c0 c0 c0 80 06 7e 01 MKVEC.v2i8 r6, r0.b2, 0x0.b0, 0x0
00 c0 c0 c0 00 04 7e 01 MKVEC.v2i8 r4, r0.b0, 0x0.b0, 0x0
80 c0 c0 c0 40 05 7e 01 MKVEC.v2i8 r5, r0^.b1, 0x0.b0, 0x0
3d 00 00 ba 44 00 10 37 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0
3d 10 00 7a 0c 04 10 03 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10
c0 00 00 00 00 08 6d 00 IADD_IMM.i32 r8, 0x0, #0x0
c0 00 00 00 00 09 6d 00 IADD_IMM.i32 r9, 0x0, #0x0
3d 00 54 00 00 0a 60 00 U16_TO_U32 r10, r61.h0
3d 09 00 00 30 00 b8 2a BRANCHZ.eq.reconverge r61.h0, offset:9
0a 00 20 00 00 0b 60 28 MOV.i32.reconverge r11, r10
c0 00 e0 01 00 00 a1 26 NOP.wait
01 0b 00 33 02 0e c5 03 LD_TILE.v4.f16.slot0 @r14:r15, u0.w1, r11, u0.w0
0b 00 24 00 00 0c 60 00 CLZ.u32 r12, r11
02 8c c0 10 06 0c 6d 01 RSHIFT_XOR.i32.not_result r12, u1.w0, r12^.b00, 0x0
8b c0 8c 50 00 0b 6a 05 LSHIFT_AND.i32.wait0 r11, r11^, 0x0.b00, r12^
8f 89 00 28 00 09 f4 00 FADD.v2f16 r9, r15^, r9^
8e 88 00 28 00 08 f4 00 FADD.v2f16 r8, r14^, r8^
0b f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r11, offset:-8
8a 00 2c 00 00 3e 60 00 POPCOUNT.i32 r62, r10^
be 00 59 00 00 3e 60 00 U32_TO_F32 r62, r62^
be 00 81 01 00 3e 60 00 FRCP.f16 r62, r62^.h00
89 3e c0 22 44 09 64 19 FMA.v2f16.wait12 r9, r9^, r62.h00, 0x0.neg
87 83 00 00 00 03 f0 00 FADD.f32 r3, r7^, r3^
83 09 00 08 00 03 f0 20 FADD.f32.wait0126 r3, r3^, r9.h1
3c 03 ea 00 01 3c d4 37 ATEST.discard @r60, r60, r3, atest_datum.w0
86 82 00 00 00 02 f0 00 FADD.f32 r2, r6^, r2^
84 80 00 00 00 00 f0 00 FADD.f32 r0, r4^, r0^
88 be c0 22 44 3f 64 01 FMA.v2f16 r63, r8^, r62^.h00, 0x0.neg
85 81 00 00 00 01 f0 00 FADD.f32 r1, r5^, r1^
81 3f 00 08 00 01 f0 00 FADD.f32 r1, r1^, r63.h1
80 bf 00 04 00 00 f0 00 FADD.f32 r0, r0^, r63^.h0
82 89 00 04 00 02 f0 24 FADD.f32.wait r2, r2^, r9^.h0
f0 00 3c 32 84 00 1b 3f BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 0f 80 10 00 b3 06 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
00 00 00 1f 5a 3c 69 03 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
40 00 20 00 00 01 61 00 MOV.i32 r1, u32.w0
41 00 20 00 00 01 61 00 MOV.i32 r1, u32.w1
4a 00 20 00 00 01 61 00 MOV.i32 r1, u37.w0
30 00 37 0f c1 0c 24 07 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
32 00 00 02 81 0c 2c 07 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
32 00 00 00 01 0c 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
32 00 00 00 01 00 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
02 00 00 11 da 00 d5 27 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
02 20 00 11 da 00 d5 07 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
02 20 00 11 c2 00 d5 23 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
80 c0 c0 02 06 00 e6 09 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
82 83 80 80 02 00 e8 01 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
82 c0 c0 03 06 00 f6 09 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0
84 86 c0 03 02 02 f4 01 ICMP_MULTI.u32.gt.u1 r2, r4^, r6^, 0x0
85 87 82 02 02 02 f0 01 ICMP_MULTI.u32.gt.m1 r2, r5^, r7^, r2^
83 c0 80 02 06 00 f2 01 ICMP_MULTI.u32.ne.m1 r0, r3^, 0x0, r0^
80 82 c0 03 02 00 f4 01 ICMP_MULTI.u32.gt.u1 r0, r0^, r2^, 0x0
81 83 80 82 02 04 f0 01 ICMP_MULTI.s32.gt.m1 r4, r1^, r3^, r0^
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
c4 c0 80 52 70 00 6b 01 LSHIFT_AND.v4i8 r0, 0x1000000.b3333, 0x0.b00, r0^
80 81 82 80 24 00 78 01 MUX.v4i8 r0, r0^, r1^, r2^
c0 c0 00 00 02 02 8f 03 LEA_PKA.slot0 @r2:r3, 0x0, 0x0
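Each line of this new v15 case list pairs the eight expected instruction bytes with the assembly text that should encode to them; per test-assembly.py above, the hex half goes through parse_hex_8 and the text half through parse_asm before the two are compared.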

View file

@ -126,6 +126,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1
00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0
01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1
40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, r0^
41 42 c0 40 06 c0 60 01 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, r1^, r2^, 0x0.neg, r0^
42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
@ -213,17 +214,17 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0
80 00 c0 13 34 7c 25 01 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
80 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w0
81 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w1
8a 00 00 00 00 c1 91 02 MOV.i32 r1, u37.w0
30 00 f7 1b 02 cc 20 09 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
32 00 80 18 02 4c 68 08 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
32 00 00 18 02 8c 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
32 00 00 18 00 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @, [r50:r51], offset:0x0
82 00 80 15 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
82 20 80 15 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
82 20 80 1d 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.s.32.2d.computed.wait0126 @r0, u1, u0.w0
32 00 00 18 02 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
82 00 80 11 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
82 20 80 11 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
82 20 80 11 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
40 c0 c0 80 03 c0 f0 10 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
42 43 40 01 01 c0 f8 00 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
42 c0 c0 c2 03 c0 f0 10 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0

View file

@ -33,8 +33,18 @@ parse_hex(const char *in)
int
main(int argc, const char **argv)
{
if (argc < 2) {
fprintf(stderr, "Expected case list\n");
if (argc < 3) {
fprintf(stderr, "Expected case list and arch version\n");
return 1;
}
if (argv[2][0] != 'v') {
fprintf(stderr, "Invalid arch version: %s\n", argv[2]);
return 1;
}
unsigned arch = atoi(&argv[2][1]);
if (arch < 9 || arch > 15) {
fprintf(stderr, "Non-supported arch version: %d\n", arch);
return 1;
}
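With this parsing in place, the disassembler test takes the case list followed by a 'v'-prefixed arch tag in the 9-15 range; an invocation might look like the following (binary and file names are illustrative, not from this change):
./test-disasm test/disasm-cases.txt v15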
@ -65,7 +75,10 @@ main(int argc, const char **argv)
uint64_t bin = parse_hex(line);
FILE *outputp = open_memstream(&output, &sz);
va_disasm_instr(outputp, bin);
if (arch < 15)
va_disasm_instr(outputp, bin);
else
va_disasm_instr_v15(outputp, bin);
fprintf(outputp, "\n");
fclose(outputp);

View file

@ -12,6 +12,7 @@
static inline void
add_imm(bi_context *ctx)
{
ctx->arch = 10;
struct hash_table_u64 *stats = _mesa_hash_table_u64_create(ctx);
bi_foreach_instr_global(ctx, I) {
va_lower_constants(ctx, I, stats, UINT32_MAX);

View file

@ -26,7 +26,9 @@ strip_discard(bi_context *ctx)
do { \
void *mem_ctx = ralloc_context(NULL); \
bi_builder *A = bit_builder(mem_ctx); \
A->shader->arch = 10; \
bi_builder *B = bit_builder(mem_ctx); \
B->shader->arch = 10; \
{ \
UNUSED bi_builder *b = A; \
test; \

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -9,9 +10,9 @@
#include <gtest/gtest.h>
#define CASE(instr, expected) \
#define CASE_ARCH(instr, arch, expected) \
do { \
uint64_t _value = va_pack_instr(instr, 10); \
uint64_t _value = va_pack_instr(instr, arch); \
if (_value != expected) { \
fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \
(uint64_t)expected); \
@ -45,124 +46,153 @@ class ValhallPacking : public testing::Test {
TEST_F(ValhallPacking, Moves)
{
CASE(bi_mov_i32_to(b, bi_register(1), bi_register(2)),
0x0091c10000000002ULL);
CASE(bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)),
0x0091c1000000008aULL);
bi_instr *I = bi_mov_i32_to(b, bi_register(1), bi_register(2));
CASE_ARCH(I, 10, 0x0091c10000000002ULL);
CASE_ARCH(I, 15, 0x0060010000200002ULL);
I = bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false));
CASE_ARCH(I, 10, 0x0091c1000000008aULL);
CASE_ARCH(I, 15, 0x006101000020000aULL);
}
TEST_F(ValhallPacking, Fadd)
{
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)),
0x00a4c00000000201ULL);
CASE(
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))),
0x00a4c02000000201ULL);
CASE(
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))),
0x00a4c01000000201ULL);
bi_instr *I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2));
CASE_ARCH(I, 10, 0x00a4c00000000201ULL);
CASE_ARCH(I, 15, 0x00f0000000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0),
bi_swz_16(bi_register(1), false, false),
bi_swz_16(bi_register(0), true, true)),
0x00a5c0000c000001ULL);
I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2)));
CASE_ARCH(I, 10, 0x00a4c02000000201ULL);
CASE_ARCH(I, 15, 0x00f0002000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)),
0x00a5c00028000001ULL);
I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2)));
CASE_ARCH(I, 10, 0x00a4c01000000201ULL);
CASE_ARCH(I, 15, 0x00f0001000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
bi_swz_16(bi_register(0), true, false)),
0x00a5c00024000001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0),
bi_swz_16(bi_register(1), false, false),
bi_swz_16(bi_register(0), true, true));
CASE_ARCH(I, 10, 0x00a5c0000c000001ULL);
CASE_ARCH(I, 15, 0x00f400000c000001ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
bi_neg(zero)),
0x00a5c0902800c040ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0));
CASE_ARCH(I, 10, 0x00a5c00028000001ULL);
CASE_ARCH(I, 15, 0x00f4000028000001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero),
0x00a4c0000000c001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
bi_swz_16(bi_register(0), true, false));
CASE_ARCH(I, 10, 0x00a5c00024000001ULL);
CASE_ARCH(I, 15, 0x00f4000024000001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)),
0x00a4c0100000c001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
bi_neg(zero));
CASE_ARCH(I, 10, 0x00a5c0902800c040ULL);
CASE_ARCH(I, 15, 0x00f600902800c080ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), true)),
0x00a4c00008000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero);
CASE_ARCH(I, 10, 0x00a4c0000000c001ULL);
CASE_ARCH(I, 15, 0x00f200000000c001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), false)),
0x00a4c00004000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero));
CASE_ARCH(I, 10, 0x00a4c0100000c001ULL);
CASE_ARCH(I, 15, 0x00f200100000c001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), true));
CASE_ARCH(I, 10, 0x00a4c00008000001ULL);
CASE_ARCH(I, 15, 0x00f0000008000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), false));
CASE_ARCH(I, 10, 0x00a4c00004000001ULL);
CASE_ARCH(I, 15, 0x00f0000004000001ULL);
}
TEST_F(ValhallPacking, Clper)
{
CASE(bi_clper_i32_to(b, bi_register(0), bi_register(0), bi_byte(n4567, 0),
BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE,
BI_SUBGROUP_SUBGROUP16),
0x00a0c030128fc900);
bi_instr *I = bi_clper_i32_to(b, bi_register(0), bi_register(0),
bi_byte(n4567, 0), BI_INACTIVE_RESULT_F1,
BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP16);
CASE_ARCH(I, 10, 0x00a0c030128fc900);
CASE_ARCH(I, 15, 0x00e20030028fc900);
}
TEST_F(ValhallPacking, Clamps)
{
bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_neg(bi_abs(bi_register(2))));
CASE(I, 0x00a4c03000000201ULL);
CASE_ARCH(I, 10, 0x00a4c03000000201ULL);
CASE_ARCH(I, 15, 0x00f0003000000201ULL);
I->clamp = BI_CLAMP_CLAMP_M1_1;
CASE(I, 0x00a4c03200000201ULL);
CASE_ARCH(I, 10, 0x00a4c03200000201ULL);
CASE_ARCH(I, 15, 0x00f0003080000201ULL);
}
TEST_F(ValhallPacking, Misc)
{
CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false),
bi_neg(zero)),
0x00b2c10400c08841ULL);
bi_instr *I = bi_fma_f32_to(
b, bi_register(1), bi_discard(bi_register(1)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), bi_neg(zero));
CASE_ARCH(I, 10, 0x00b2c10400c08841ULL);
CASE_ARCH(I, 15, 0x0166010400c00881ULL);
CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
BI_ROUND_RTN),
0x0090c240800d0042ULL);
I = bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x0090c240800d0042ULL);
CASE_ARCH(I, 15, 0x00600242004d0082ULL);
CASE(bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0),
BI_ROUND_RTN),
0x00904000a00f0000ULL);
I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0),
BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x00904000a00f0000ULL);
/* Removed on v11 */
CASE(
bi_fround_v2f16_to(b, bi_half(bi_register(0), false),
bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN),
0x00904000900f0001ULL);
I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false),
bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x00904000900f0001ULL);
/* Removed on v11 */
}
TEST_F(ValhallPacking, FaddImm)
{
CASE(bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)),
0x4847C6C0),
0x0114C24847C6C042ULL);
bi_instr *I = bi_fadd_imm_f32_to(b, bi_register(2),
bi_discard(bi_register(2)), 0x4847C6C0);
CASE_ARCH(I, 10, 0x0114C24847C6C042ULL);
CASE_ARCH(I, 15, 0x0064024847c6c082ULL);
CASE(bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)),
0x70AC6784),
0x0115C270AC678442ULL);
I = bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)),
0x70AC6784);
CASE_ARCH(I, 10, 0x0115C270AC678442ULL);
CASE_ARCH(I, 15, 0x00620270ac678482ULL);
}
TEST_F(ValhallPacking, Comparisons)
{
CASE(bi_icmp_or_v2s16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), true, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1),
0x00f9c21184c04243);
bi_instr *I = bi_icmp_or_v2s16_to(
b, bi_register(2), bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), true, false)), zero, BI_CMPF_GT,
BI_RESULT_TYPE_M1);
CASE_ARCH(I, 10, 0x00f9c21184c04243);
CASE_ARCH(I, 15, 0x01e40212c6c08283);
CASE(bi_fcmp_or_v2f16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), false, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1),
0x00f5c20190c04243);
I = bi_fcmp_or_v2f16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), false, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1);
CASE_ARCH(I, 10, 0x00f5c20190c04243);
CASE_ARCH(I, 15, 0x01e4020352c08283);
}
TEST_F(ValhallPacking, Conversions)
{
CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))),
0x0090c22000070042);
bi_instr *I =
bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)));
CASE_ARCH(I, 10, 0x0090c22000070042);
/* Removed on v11 */
}
TEST_F(ValhallPacking, BranchzI16)
@ -170,88 +200,105 @@ TEST_F(ValhallPacking, BranchzI16)
bi_instr *I =
bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ);
I->branch_offset = 1;
CASE(I, 0x001fc03000000102);
CASE_ARCH(I, 10, 0x001fc03000000102);
CASE_ARCH(I, 15, 0x02b8003000000102);
}
TEST_F(ValhallPacking, BranchzI16Backwards)
{
bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ);
I->branch_offset = -8;
CASE(I, 0x001fc017fffff8c0);
CASE_ARCH(I, 10, 0x001fc017fffff8c0);
CASE_ARCH(I, 15, 0x02b90017fffff8c0);
}
TEST_F(ValhallPacking, Blend)
{
CASE(
bi_instr *I =
bi_blend_to(b, bi_null(), bi_register(0), bi_register(60),
bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true),
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0),
0x007f4004333c00f0);
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0);
CASE_ARCH(I, 10, 0x007f4004333c00f0);
CASE_ARCH(I, 15, 0x031b0082333c00f0);
}
TEST_F(ValhallPacking, Mux)
{
CASE(bi_mux_i32_to(b, bi_register(0), bi_discard(bi_register(0)),
bi_discard(bi_register(4)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false),
BI_MUX_BIT),
0x00b8c00300804440ull);
bi_instr *I = bi_mux_i32_to(
b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT);
CASE_ARCH(I, 10, 0x00b8c00300804440ull);
CASE_ARCH(I, 15, 0x017c000c80008480ull);
}
TEST_F(ValhallPacking, AtestFP16)
{
CASE(bi_atest_to(b, bi_register(60), bi_register(60),
bi_half(bi_register(1), true),
bi_fau(BIR_FAU_ATEST_PARAM, false)),
0x007dbc0208ea013c);
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60),
bi_half(bi_register(1), true),
bi_fau(BIR_FAU_ATEST_PARAM, false));
CASE_ARCH(I, 10, 0x007dbc0208ea013c);
CASE_ARCH(I, 15, 0x03d43c0108ea013c);
}
TEST_F(ValhallPacking, AtestFP32)
{
CASE(bi_atest_to(b, bi_register(60), bi_register(60), one,
bi_fau(BIR_FAU_ATEST_PARAM, false)),
0x007dbc0200ead03c);
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), one,
bi_fau(BIR_FAU_ATEST_PARAM, false));
CASE_ARCH(I, 10, 0x007dbc0200ead03c);
CASE_ARCH(I, 15, 0x03d63c0100ead03c);
}
TEST_F(ValhallPacking, Transcendentals)
{
CASE(bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true),
0x0099c10001000000);
bi_instr *I =
bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true);
CASE_ARCH(I, 10, 0x0099c10001000000);
CASE_ARCH(I, 15, 0x0060010041200000);
CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
true),
0x0099c00001020040);
I = bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
true);
CASE_ARCH(I, 10, 0x0099c00001020040);
CASE_ARCH(I, 15, 0x0060000041220080);
CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), 0x009cc20000020001);
I = bi_frsq_f32_to(b, bi_register(2), bi_register(1));
CASE_ARCH(I, 10, 0x009cc20000020001);
CASE_ARCH(I, 15, 0x0060020001820001);
CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(2)), bi_neg(zero),
bi_discard(bi_register(0)), BI_SPECIAL_LEFT),
0x0162c00440c04241);
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(2)), bi_neg(zero),
bi_discard(bi_register(0)), BI_SPECIAL_LEFT);
CASE_ARCH(I, 10, 0x0162c00440c04241);
CASE_ARCH(I, 15, 0x0264000e80c08281);
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_register(1), bi_register(2),
bi_neg(zero), bi_discard(bi_register(0)),
BI_SPECIAL_N);
CASE_ARCH(I, 10, 0x0161c00440c00201);
CASE_ARCH(I, 15, 0x0264000d80c00201);
}
TEST_F(ValhallPacking, Csel)
{
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_EQ),
0x0150c10085844342);
bi_instr *I = bi_csel_u32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ);
CASE_ARCH(I, 10, 0x0150c10085844342);
CASE_ARCH(I, 15, 0x027c010005048382);
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_LT),
0x0150c10485844342);
I = bi_csel_u32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
CASE_ARCH(I, 10, 0x0150c10485844342);
CASE_ARCH(I, 15, 0x027c010805048382);
CASE(bi_csel_s32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_LT),
0x0158c10485844342);
I = bi_csel_s32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
CASE_ARCH(I, 10, 0x0158c10485844342);
CASE_ARCH(I, 15, 0x027c014805048382);
}
TEST_F(ValhallPacking, LdAttrImm)
@ -261,34 +308,67 @@ TEST_F(ValhallPacking, LdAttrImm)
bi_discard(bi_register(61)), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1);
I->table = 1;
CASE(I, 0x0066800433117d7c);
CASE_ARCH(I, 10, 0x0066800433117d7c);
CASE_ARCH(I, 15, 0x038400023311bdbc);
}
TEST_F(ValhallPacking, LdVarBufImmF16)
{
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER,
BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE,
BI_VECSIZE_V4, 0),
0x005d82143300003d);
bi_instr *I = bi_ld_var_buf_imm_f16_to(
b, bi_register(2), bi_register(61), BI_REGISTER_FORMAT_F16,
BI_SAMPLE_CENTER, BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, BI_VECSIZE_V4,
0);
CASE_ARCH(I, 10, 0x005d82143300003d);
CASE_ARCH(I, 15, 0x0310020a3f00003d);
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 0),
0x005d80843300003d);
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 0);
CASE_ARCH(I, 10, 0x005d80843300003d);
CASE_ARCH(I, 15, 0x031000423f00003d);
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 8),
0x005d80443308003d);
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 8);
CASE_ARCH(I, 10, 0x005d80443308003d);
CASE_ARCH(I, 11, 0x005d80443300083d);
CASE_ARCH(I, 15, 0x031000223f00083d);
}
TEST_F(ValhallPacking, LdVarBufFlatImmFormat)
{
bi_instr *I = bi_ld_var_buf_flat_imm_to(
b, bi_register(0), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 0x12);
CASE_ARCH(I, 14, 0x0040800832001200);
CASE_ARCH(I, 15, 0x033900043a0012c0);
I = bi_ld_var_buf_flat_imm_to(b, bi_register(0), BI_REGISTER_FORMAT_F16,
BI_VECSIZE_V4, 0x12);
CASE_ARCH(I, 14, 0x0040800433001200);
CASE_ARCH(I, 15, 0x033900023b0012c0);
}
TEST_F(ValhallPacking, LdVarBufFlat)
{
bi_instr *I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4);
CASE_ARCH(I, 14, 0x005f80083200003d);
CASE_ARCH(I, 15, 0x031400043a00003d);
I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4);
CASE_ARCH(I, 14, 0x005f80043300003d);
CASE_ARCH(I, 15, 0x031400023b00003d);
}
TEST_F(ValhallPacking, LeaBufImm)
{
CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))),
0x005e84040000007b);
bi_instr *I =
bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59)));
CASE_ARCH(I, 10, 0x005e84040000007b);
CASE_ARCH(I, 15, 0x03080402000000bb);
}
TEST_F(ValhallPacking, StoreMemoryAccess)
@ -296,61 +376,94 @@ TEST_F(ValhallPacking, StoreMemoryAccess)
bi_instr *I = bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)),
bi_discard(bi_register(5)), BI_SEG_NONE, 0);
I->mem_access = VA_MEMORY_ACCESS_ESTREAM;
CASE(I, 0x0061400632000044);
CASE_ARCH(I, 10, 0x0061400632000044);
CASE_ARCH(I, 15, 0x0320009302000084);
}
TEST_F(ValhallPacking, Convert16To32)
{
CASE(bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000140077);
bi_instr *I = bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000140077);
CASE_ARCH(I, 15, 0x00600200005400b7);
CASE(bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010140077);
I = bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010140077);
CASE_ARCH(I, 15, 0x00600200105400b7);
CASE(bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000150077);
I = bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000150077);
/* Removed on v11 */
CASE(bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010150077);
I = bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010150077);
/* Removed on v11 */
CASE(bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000040077);
I = bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000040077);
CASE_ARCH(I, 15, 0x00600200004400b7);
CASE(bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010040077);
I = bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010040077);
CASE_ARCH(I, 15, 0x00600200104400b7);
}
TEST_F(ValhallPacking, Swizzle8)
{
CASE(bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1),
0x00f2c14300c0c000);
bi_instr *I =
bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1);
CASE_ARCH(I, 10, 0x00f2c14300c0c000);
/* Removed on v11 */
}
TEST_F(ValhallPacking, FauPage1)
{
CASE(bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)),
0x0291c10000000080ULL);
bi_instr *I = bi_mov_i32_to(
b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false));
CASE_ARCH(I, 10, 0x0291c10000000080ULL);
CASE_ARCH(I, 15, 0x0061010000200040ULL);
}
TEST_F(ValhallPacking, LdTileV3F16)
{
CASE(bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16,
BI_VECSIZE_V3),
0x0078840423033c40);
bi_instr *I = bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
bi_register(60), bi_register(3),
BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3);
CASE_ARCH(I, 10, 0x0078840423033c40);
CASE_ARCH(I, 15, 0x03c0040223033c80);
}
TEST_F(ValhallPacking, Rhadd8)
{
CASE(bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(0)), BI_ROUND_RTP),
0x00aac000400b4041);
bi_instr *I = bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(0)), BI_ROUND_RTP);
CASE_ARCH(I, 10, 0x00aac000400b4041);
/* Removed on v11 */
}
TEST_F(ValhallPacking, Atomics)
{
bi_instr *I =
bi_atom1_return_i64_to(b, bi_register(0), bi_discard(bi_register(2)),
bi_register(3), BI_ATOM_OPC_AINC, 2);
CASE_ARCH(I, 10, 0x0069800428000042);
CASE_ARCH(I, 15, 0x0328000220000082);
I = bi_atom_return_i32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_register(2), bi_register(3), BI_ATOM_OPC_AXCHG,
1);
CASE_ARCH(I, 10, 0x0120c1021bc00002);
CASE_ARCH(I, 15, 0x032401c10f000002);
I = bi_atom_return_i64_to(b, bi_register(0), bi_register(2), bi_register(6),
bi_register(7), BI_ATOM_OPC_ACMPXCHG, 2);
CASE_ARCH(I, 10, 0x0120c2182fc00006);
CASE_ARCH(I, 15, 0x032802cc2f000006);
}

View file

@ -9,9 +9,9 @@
#include <gtest/gtest.h>
#define CASE(instr, expected) \
#define CASE_ARCH(instr, arch, expected) \
do { \
if (va_validate_fau(instr) != expected) { \
if (va_validate_fau(instr, arch) != expected) { \
fprintf(stderr, "Incorrect validation for:\n"); \
bi_print_instr(instr, stderr); \
fprintf(stderr, "\n"); \
@ -19,8 +19,8 @@
} \
} while (0)
#define VALID(instr) CASE(instr, true)
#define INVALID(instr) CASE(instr, false)
#define VALID(instr) CASE_ARCH(instr, 10, true)
#define INVALID(instr) CASE_ARCH(instr, 10, false)
class ValidateFau : public testing::Test {
protected:

View file

@ -13,9 +13,9 @@
extern "C" {
#endif
bool va_validate_fau(bi_instr *I);
bool va_validate_fau(bi_instr *I, unsigned arch);
void va_validate(FILE *fp, bi_context *ctx);
void va_repair_fau(bi_builder *b, bi_instr *I);
void va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch);
void va_fuse_add_imm(bi_instr *I);
void va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts, uint32_t min_fau_count);
void va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts);
@ -28,14 +28,15 @@ void va_gather_hsr_info(bi_context *ctx, struct pan_shader_info *info);
uint64_t va_pack_instr(const bi_instr *I, unsigned arch);
static inline unsigned
va_fau_page(enum bir_fau value)
va_fau_page(enum bir_fau value, unsigned arch)
{
/* Uniform slots of FAU have a 7-bit index (8-bit from v15). The top 2 bits
* are the page; the remaining low bits are specified in the source.
*/
if (value & BIR_FAU_UNIFORM) {
unsigned value_shift = arch >= 15 ? 6 : 5;
unsigned slot = value & ~BIR_FAU_UNIFORM;
unsigned page = slot >> 5;
unsigned page = slot >> value_shift;
assert(page <= 3);
return page;
@ -57,11 +58,11 @@ va_fau_page(enum bir_fau value)
}
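As a worked example of the widened index (slot value hypothetical, not from this change), the same uniform slot can land on a different page across generations:
/* Page selection per va_fau_page above. */
unsigned slot = 40;            /* hypothetical uniform slot */
unsigned page_v10 = slot >> 5; /* == 1: v10 keeps a 5-bit index in the source */
unsigned page_v15 = slot >> 6; /* == 0: v15 widens the source index to 6 bits */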
static inline unsigned
va_select_fau_page(const bi_instr *I)
va_select_fau_page(const bi_instr *I, unsigned arch)
{
bi_foreach_src(I, s) {
if (I->src[s].type == BI_INDEX_FAU)
return va_fau_page((enum bir_fau)I->src[s].value);
return va_fau_page((enum bir_fau)I->src[s].value, arch);
}
return 0;
@ -77,7 +78,7 @@ struct va_stats {
unsigned nr_fau_uniforms;
};
void va_count_instr_stats(bi_instr *I, struct va_stats *stats);
void va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats);
#ifdef __cplusplus
} /* extern C */

View file

@ -77,6 +77,8 @@ walk_bir_shader(bi_context *ctx, struct pan_shader_info *info)
if (instr->sample == BI_SAMPLE_CENTROID)
info->fs.hsr.centroid_interpolation = true;
FALLTHROUGH;
case BI_OPCODE_LD_VAR_BUF_FLAT:
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
case BI_OPCODE_LD_VAR_FLAT:
case BI_OPCODE_LD_VAR_FLAT_IMM:
if (!found_atest)

View file

@ -520,7 +520,7 @@ va_assign_slots(bi_context *ctx)
bi_foreach_instr_global(ctx, I) {
if (I->op == BI_OPCODE_BARRIER) {
I->slot = 7;
I->slot = (ctx->arch >= 15) ? VA_SLOT_V15_SLOT7 : VA_SLOT_SLOT7;
} else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
I->slot = 0;
} else if (bi_get_opcode_props(I)->message) {

View file

@ -211,7 +211,7 @@ va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info,
static uint32_t
va_resolve_swizzles(bi_context *ctx, bi_instr *I, unsigned s)
{
struct va_src_info info = va_src_info(I->op, s);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
uint32_t value = I->src[s].value;
enum bi_swizzle swz = I->src[s].swizzle;
@ -257,9 +257,10 @@ va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts,
/* abs(#c) is pointless, but -#c occurs in transcendental sequences */
assert(!I->src[s].abs && "redundant .abs modifier");
bool is_signed = valhall_opcodes[I->op].is_signed;
bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
struct va_src_info info = va_src_info(I->op, s);
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
bool staging =
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
const uint32_t value = va_resolve_swizzles(ctx, I, s);
const uint32_t count = (uintptr_t)_mesa_hash_table_u64_search(counts, value);
@ -294,12 +295,13 @@ va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts)
if (I->src[s].type != BI_INDEX_CONSTANT)
continue;
const bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
const bool staging =
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
if (staging)
continue;
bool is_signed = valhall_opcodes[I->op].is_signed;
struct va_src_info info = va_src_info(I->op, s);
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
uint32_t value = va_resolve_swizzles(ctx, I, s);
bi_index cons = va_lookup_constant(value, info, is_signed);

View file

@ -78,7 +78,7 @@ va_lower_split_64bit(bi_context *ctx)
if (bi_is_null(I->src[s]) || s >= 4)
continue;
struct va_src_info info = va_src_info(I->op, s);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
/* Only split if the instruction expects 64-bit inputs as two separate
* sources. */

View file

@ -179,7 +179,7 @@ va_mark_last(bi_context *ctx)
break;
/* Only need to unmark split registers. */
if (va_src_info(I->op, s).size == VA_SIZE_64 &&
if (va_src_info(I->op, s, ctx->arch).size == VA_SIZE_64 &&
bi_count_read_registers(I, s) == 1) {
bool both_discard = I->src[s].discard && I->src[s + 1].discard;

View file

@ -286,7 +286,7 @@ va_fuse_cmp(bi_context *ctx, bi_instr **lut, const BITSET_WORD *multiple,
static bool
va_propagate_replicate_wide(bi_context *ctx, bi_instr **lut, bi_instr *I)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, ctx->arch);
bool progress = false;
bi_foreach_ssa_src(I, s) {

View file

@ -74,6 +74,15 @@ va_pack_reg(const bi_instr *I, bi_index idx)
return idx.value;
}
static unsigned
va_pack_reg_v15(const bi_instr *I, bi_index idx)
{
pack_assert(I, idx.type == BI_INDEX_REGISTER);
pack_assert(I, idx.value < 128);
return idx.value;
}
static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
@ -124,6 +133,21 @@ va_pack_fau_64(const bi_instr *I, bi_index idx)
return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}
static unsigned
va_pack_fau_64_v15(const bi_instr *I, bi_index idx)
{
pack_assert(I, idx.type == BI_INDEX_FAU);
unsigned val = (idx.value & BITFIELD_MASK(6));
if (idx.value & BIR_FAU_IMMEDIATE)
return (0x7 << 6) | (val << 1);
else if (idx.value & BIR_FAU_UNIFORM)
return (0x2 << 7) | (val << 1);
else
return (0xf << 5) | (va_pack_fau_special(I, idx.value) << 1);
}
static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
@ -142,6 +166,33 @@ va_pack_src(const bi_instr *I, unsigned s)
invalid_instruction(I, "type of source %u", s);
}
static uint64_t
va_pack_src_v15(const bi_instr *I, unsigned s, unsigned loc)
{
bi_index idx = I->src[s];
uint64_t hex = 0;
uint64_t regval = 0;
if (idx.type == BI_INDEX_REGISTER) {
regval = va_pack_reg_v15(I, idx);
if (idx.discard)
regval |= (1 << 7);
} else if (idx.type == BI_INDEX_FAU) {
pack_assert(I, idx.offset <= 1);
regval = va_pack_fau_64_v15(I, idx) | idx.offset;
} else
invalid_instruction(I, "type of source %u", s);
uint64_t low8 = regval & 0xff;
uint64_t high1 = (regval >> 8) & 0x1;
hex |= (low8 << (8 * loc));
hex |= (high1 << (48 + loc));
return hex;
}
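As a minimal sketch of the layout this produces (operand chosen for illustration): a v15 source is a 9-bit value whose low byte lands at bits [8*loc+7:8*loc] and whose ninth bit lands at bit 48+loc. For uniform u0.w0 in source slot loc=1, following va_pack_fau_64_v15 above:
uint64_t regval = (0x2 << 7) | (0 << 1) | 0; /* uniform tag, slot 0, word 0 -> 0x100 */
uint64_t hex = ((regval & 0xff) << (8 * 1)) /* low byte -> bits [15:8], all zero here */
| (((regval >> 8) & 1) << (48 + 1)); /* ninth bit -> bit 49 */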
static unsigned
va_pack_wrmask(const bi_instr *I)
{
@ -211,6 +262,20 @@ va_pack_dest(const bi_instr *I)
return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}
static unsigned
va_pack_dest_v15(const bi_instr *I)
{
assert(I->nr_dests);
switch (I->op) {
case BI_OPCODE_SHADDX_S64:
case BI_OPCODE_SHADDX_U64:
/* 64-bit dest has a 0x0 wrmask */
return va_pack_reg_v15(I, I->dest[0]);
default:
return va_pack_reg_v15(I, I->dest[0]) | (va_pack_wrmask(I) << 13);
}
}
static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
@ -454,10 +519,22 @@ va_pack_rhadd(const bi_instr *I)
}
}
static uint64_t
va_pack_clamp_special_round_v15(const bi_instr *I)
{
pack_assert(I, I->special < 4);
if (I->special == BI_SPECIAL_N && I->round == BI_ROUND_RTZ)
return 0x4;
else if (I->special)
return 0x4 | I->special;
else
return I->clamp;
}
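Reading these branches together (BI_SPECIAL_N must be the zero "no special" value for the first test to be distinguishable): the packed field carries the plain clamp when no special mode is set, 0x4 for a bare round-to-zero, and 0x4 | special for the FMA_RSCALE special modes.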
static uint64_t
va_pack_alu(const bi_instr *I, unsigned arch)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
uint64_t hex = 0;
switch (I->op) {
@ -467,25 +544,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
case BI_OPCODE_FREXPM_F32:
case BI_OPCODE_FREXPM_V2F16:
if (I->sqrt)
hex |= 1ull << 24;
hex |= 1ull << ((arch >= 15) ? 30 : 24);
if (I->log)
hex |= 1ull << 25;
hex |= 1ull << ((arch >= 15) ? 31 : 25);
break;
case BI_OPCODE_FLUSH_F32:
case BI_OPCODE_FLUSH_V2F16:
hex |= I->nan_mode << 8;
hex |= I->nan_mode << ((arch >= 15) ? 30 : 8);
if (I->ftz)
hex |= 1ull << 10;
hex |= 1ull << ((arch >= 15) ? 32 : 10);
if (I->flush_inf)
hex |= 1ull << 11;
hex |= 1ull << ((arch >= 15) ? 33 : 11);
break;
/* Add mux type */
case BI_OPCODE_MUX_I32:
case BI_OPCODE_MUX_V2I16:
case BI_OPCODE_MUX_V4I8:
hex |= (uint64_t)I->mux << 32;
hex |= (uint64_t)I->mux << ((arch >= 15) ? 34 : 32);
break;
/* Add .eq flag */
@ -497,7 +574,7 @@ va_pack_alu(const bi_instr *I, unsigned arch)
hex |= (1ull << 36);
if (I->op == BI_OPCODE_BRANCHZI)
hex |= (0x1ull << 40); /* Absolute */
hex |= (0x1ull << ((arch >= 15) ? 31 : 40)); /* Absolute */
else
hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;
@ -513,7 +590,46 @@ va_pack_alu(const bi_instr *I, unsigned arch)
case BI_OPCODE_RSHIFT_XOR_I32:
case BI_OPCODE_RSHIFT_XOR_V2I16:
case BI_OPCODE_RSHIFT_XOR_V4I8:
hex |= (uint64_t)I->arithmetic << 34;
if (arch >= 15) {
/* Rewrite exact to ARSHIFT */
if (I->arithmetic) {
switch (I->op) {
case BI_OPCODE_RSHIFT_AND_I32:
case BI_OPCODE_RSHIFT_AND_V2I16:
case BI_OPCODE_RSHIFT_AND_V4I8: {
uint64_t arshift_and_op = (0xcULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) |
arshift_and_op) == arshift_and_op);
hex |= arshift_and_op;
break;
}
case BI_OPCODE_RSHIFT_OR_I32:
case BI_OPCODE_RSHIFT_OR_V2I16:
case BI_OPCODE_RSHIFT_OR_V4I8: {
uint64_t arshift_or_op = (0xdULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) | arshift_or_op) ==
arshift_or_op);
hex |= arshift_or_op;
break;
}
case BI_OPCODE_RSHIFT_XOR_I32:
case BI_OPCODE_RSHIFT_XOR_V2I16:
case BI_OPCODE_RSHIFT_XOR_V4I8: {
uint64_t arshift_xor_op = (0xbULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) |
arshift_xor_op) == arshift_xor_op);
hex |= arshift_xor_op;
break;
}
default:
UNREACHABLE("RSHIFT->ARSHIFT");
}
}
} else
hex |= (uint64_t)I->arithmetic << 34;
break;
case BI_OPCODE_LEA_BUF_IMM:
@ -564,8 +680,12 @@ va_pack_alu(const bi_instr *I, unsigned arch)
}
hex |= ((uint64_t)va_pack_source_format(I)) << 24;
hex |= ((uint64_t)I->update) << 36;
hex |= ((uint64_t)I->sample) << 38;
hex |= ((uint64_t)I->update) << ((arch >= 15) ? 35 : 36);
hex |= ((uint64_t)I->sample) << ((arch >= 15) ? 37 : 38);
break;
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
hex |= ((uint64_t)I->index) << 8;
break;
case BI_OPCODE_LD_ATTR_IMM:
@ -599,20 +719,18 @@ va_pack_alu(const bi_instr *I, unsigned arch)
break;
}
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
pack_assert(I, I->special < 4);
hex |= ((uint64_t)I->special) << 48;
}
/* Add the normal destination or a placeholder. Staging destinations are
* added elsewhere, as they require special handling for control fields.
*/
if (info.has_dest && info.nr_staging_dests == 0) {
hex |= (uint64_t)va_pack_dest(I) << 40;
if (arch >= 15)
hex |= (uint64_t)va_pack_dest_v15(I) << 40;
else
hex |= (uint64_t)va_pack_dest(I) << 40;
} else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
pack_assert(I, I->nr_dests == 0);
hex |= 0xC0ull << 40; /* Placeholder */
if (arch < 15)
hex |= 0xC0ull << 40; /* Placeholder */
}
bool swap12 = va_swap_12(I->op);
@ -627,7 +745,10 @@ va_pack_alu(const bi_instr *I, unsigned arch)
enum va_size size = src_info.size;
bi_index src = I->src[logical_i + src_offset];
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
if (arch >= 15)
hex |= va_pack_src_v15(I, logical_i + src_offset, i);
else
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
if (src_info.notted) {
if (src.neg)
@ -636,10 +757,15 @@ va_pack_alu(const bi_instr *I, unsigned arch)
unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
unsigned abs_offs = 33 + 2 + ((2 - i) * 2);
if (src.neg)
hex |= 1ull << neg_offs;
if (src.abs)
hex |= 1ull << abs_offs;
if (arch >= 15 && I->op == BI_OPCODE_FMA_RSCALE_F32 && i == 2) {
if (src.neg)
hex |= 1ull << (neg_offs + 1);
} else {
if (src.neg)
hex |= 1ull << neg_offs;
if (src.abs)
hex |= 1ull << abs_offs;
}
} else {
if (src.neg)
invalid_instruction(I, "negate");
@ -659,8 +785,8 @@ va_pack_alu(const bi_instr *I, unsigned arch)
unsigned offs = (i == 1) ? 26 : 36;
hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
} else if (src_info.lane) {
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ?
((i == 0) ? 38 : 36) : ((i == 0) ? 28 : 26);
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36)
: ((i == 0) ? 28 : 26);
if (src_info.size == VA_SIZE_16) {
hex |= (src.swizzle == BI_SWIZZLE_H1 ? 1 : 0) << offs;
@ -673,7 +799,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
} else if (src_info.lanes) {
pack_assert(I, src_info.size == VA_SIZE_8);
pack_assert(I, i == 1);
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
if (arch >= 15 && I->op == BI_OPCODE_CLPER_I32) {
switch (src.swizzle) {
case BI_SWIZZLE_B00:
hex |= 0x0ULL << 28;
break;
case BI_SWIZZLE_B11:
hex |= 0x1ULL << 28;
break;
case BI_SWIZZLE_B22:
hex |= 0x2ULL << 28;
break;
case BI_SWIZZLE_B33:
hex |= 0x3ULL << 28;
break;
default:
invalid_instruction(I, "lane shift");
}
} else
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
} else if (src_info.combine) {
/* Treat as swizzle, subgroup ops not yet supported */
pack_assert(I, src_info.size == VA_SIZE_32);
@ -689,17 +833,33 @@ va_pack_alu(const bi_instr *I, unsigned arch)
}
if (info.saturate)
hex |= (uint64_t)I->saturate << 30;
if (info.rhadd)
hex |= (uint64_t)I->saturate << ((arch >= 15) ? 25 : 30);
if (info.rhadd) {
pack_assert(I, arch < 15);
hex |= va_pack_rhadd(I);
if (info.clamp)
hex |= (uint64_t)I->clamp << 32;
if (info.round_mode)
hex |= (uint64_t)I->round << 30;
}
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
if (arch >= 15) {
hex |= va_pack_clamp_special_round_v15(I) << 32;
} else {
pack_assert(I, I->special < 4);
hex |= ((uint64_t)I->special) << 48;
if (info.clamp)
hex |= (uint64_t)I->clamp << 32;
if (info.round_mode && I->round == BI_ROUND_RTZ)
hex |= (uint64_t)0x1 << 50;
}
} else {
if (info.clamp)
hex |= (uint64_t)I->clamp << ((arch >= 15) ? 30 : 32);
if (info.round_mode)
hex |= (uint64_t)I->round << ((arch >= 15) ? 32 : 30);
}
if (info.condition)
hex |= (uint64_t)I->cmpf << 32;
hex |= (uint64_t)I->cmpf << ((arch >= 15) ? 33 : 32);
if (info.result_type)
hex |= (uint64_t)I->result_type << 30;
hex |= (uint64_t)I->result_type << ((arch >= 15) ? 24 : 30);
return hex;
}
@ -748,7 +908,8 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
};
unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
/* TODO hack */
unsigned memory_size = (get_valhall_opcode(I->op, 10).exact >> 27) & 0x7;
uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;
// unsigned
@ -765,6 +926,26 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
return hex;
}
static uint64_t
va_pack_load_v15(const bi_instr *I, bool buffer_descriptor)
{
/* This implicitly means identity: VA_LOAD_LANE_8_BIT_B0 for i8 (bits[28;27])
* and VA_LOAD_LANE_16_BIT_H0 for i16 (bit[27]) */
uint64_t hex = 0;
if (!buffer_descriptor)
hex |= va_pack_byte_offset(I);
hex |= va_pack_src_v15(I, 0, 0);
hex |= (uint64_t)I->mem_access << 24;
if (buffer_descriptor)
hex |= va_pack_src_v15(I, 1, 1);
return hex;
}
static uint64_t
va_pack_store(const bi_instr *I)
{
@ -779,6 +960,20 @@ va_pack_store(const bi_instr *I)
return hex;
}
static uint64_t
va_pack_store_v15(const bi_instr *I)
{
uint64_t hex = 0;
va_validate_register_pair(I, 1);
hex |= va_pack_src_v15(I, 1, 0);
hex |= I->mem_access << 24;
hex |= va_pack_byte_offset(I);
return hex;
}
static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
@ -798,27 +993,6 @@ va_pack_lod_mode(const bi_instr *I)
invalid_instruction(I, "LOD mode");
}
static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
switch (I->register_format) {
case BI_REGISTER_FORMAT_F16:
case BI_REGISTER_FORMAT_F32:
return VA_REGISTER_TYPE_F;
case BI_REGISTER_FORMAT_U16:
case BI_REGISTER_FORMAT_U32:
return VA_REGISTER_TYPE_U;
case BI_REGISTER_FORMAT_S16:
case BI_REGISTER_FORMAT_S32:
return VA_REGISTER_TYPE_S;
default:
invalid_instruction(I, "register type");
}
}
static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
@ -842,13 +1016,45 @@ va_pack_register_format(const bi_instr *I)
}
}
static uint64_t
va_pack_src_null_v15(unsigned loc)
{
uint64_t hex = 0;
uint64_t regval = 0x1c0;
uint64_t low8 = regval & 0xff;
uint64_t high1 = (regval >> 8) & 0x1;
hex |= (low8 << (8 * loc));
hex |= (high1 << (48 + loc));
return hex;
}
static unsigned
va_repack_sr_control_v15(unsigned sr_control)
{
unsigned repacked = 0;
bool read = sr_control & 0x1;
bool write = sr_control & 0x2;
if (read) {
repacked |= 0x2;
if (write)
repacked |= 0x1;
}
return repacked;
}
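Enumerated, the remap from the legacy read (bit 0) and write (bit 1) flags is:
/* Legacy sr_control -> v15 encoding, per va_repack_sr_control_v15 above:
* 0b00 (none)         -> 0b00
* 0b01 (read)         -> 0b10
* 0b10 (write only)   -> 0b00 (write without read is dropped)
* 0b11 (read + write) -> 0b11
*/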
uint64_t
va_pack_instr(const bi_instr *I, unsigned arch)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
hex |= ((uint64_t)va_select_fau_page(I)) << 57;
uint64_t hex =
info.exact | (((uint64_t)I->flow) << ((arch >= 15) ? 58 : 59));
hex |= ((uint64_t)va_select_fau_page(I, arch)) << ((arch >= 15) ? 62 : 57);
if (info.slot)
hex |= ((uint64_t)I->slot << 30);
@ -860,14 +1066,60 @@ va_pack_instr(const bi_instr *I, unsigned arch)
unsigned count =
read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);
hex |= ((uint64_t)count << 33);
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
hex |= ((uint64_t)info.sr_control << 46);
hex |= ((uint64_t)count << ((arch >= 15) ? 32 : 33));
if (arch >= 15) {
hex |= (uint64_t)va_pack_reg_v15(I, sr) << 40;
hex |= ((uint64_t)va_repack_sr_control_v15(info.sr_control) << 38);
} else {
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
hex |= ((uint64_t)info.sr_control << 46);
}
}
/* On v15, some instructions require special sr_control values */
if (arch >= 15) {
switch (I->op) {
case BI_OPCODE_BARRIER: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0 || sr_control == 0x2);
hex |= (uint64_t)0x2 << 38;
break;
}
case BI_OPCODE_ATOM1_RETURN_I32:
case BI_OPCODE_ATOM1_RETURN_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0);
break;
}
case BI_OPCODE_ATOM_I32:
case BI_OPCODE_ATOM_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x2);
break;
}
case BI_OPCODE_ATOM_RETURN_I32:
case BI_OPCODE_ATOM_RETURN_I64:
case BI_OPCODE_AXCHG_I32:
case BI_OPCODE_AXCHG_I64:
case BI_OPCODE_ACMPXCHG_I32:
case BI_OPCODE_ACMPXCHG_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0 || sr_control == 0x3);
hex |= (uint64_t)0x3 << 38;
break;
}
default:
break;
}
}
if (info.sr_write_count) {
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1)
<< ((arch >= 15) ? 35 : 36);
if (arch >= 15)
hex |= ((uint64_t)va_pack_reg_v15(I, I->dest[0])) << 16;
else
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
}
if (info.vecsize)
@ -885,7 +1137,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_LOAD_I64:
case BI_OPCODE_LOAD_I96:
case BI_OPCODE_LOAD_I128:
hex |= va_pack_load(I, false);
if (arch >= 15)
hex |= va_pack_load_v15(I, false);
else
hex |= va_pack_load(I, false);
break;
case BI_OPCODE_LD_PKA_I8:
@ -896,7 +1151,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_LD_PKA_I64:
case BI_OPCODE_LD_PKA_I96:
case BI_OPCODE_LD_PKA_I128:
hex |= va_pack_load(I, true);
if (arch >= 15)
hex |= va_pack_load_v15(I, true);
else
hex |= va_pack_load(I, true);
break;
case BI_OPCODE_STORE_I8:
@ -907,20 +1165,26 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_STORE_I64:
case BI_OPCODE_STORE_I96:
case BI_OPCODE_STORE_I128:
hex |= va_pack_store(I);
if (arch >= 15)
hex |= va_pack_store_v15(I);
else
hex |= va_pack_store(I);
break;
case BI_OPCODE_ATOM1_RETURN_I64:
/* Permit omitting the destination for plain ATOM1 */
if (!bi_count_write_registers(I, 0)) {
if (arch < 15 && !bi_count_write_registers(I, 0)) {
hex |= (0x40ull << 40); // fake read
}
/* 64-bit source */
va_validate_register_pair(I, 0);
hex |= (uint64_t)va_pack_src(I, 0) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
break;
case BI_OPCODE_ACMPXCHG_I64:
@ -929,29 +1193,43 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_ATOM_RETURN_I64:
/* 64-bit source */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
hex |= (0xc0ull << 40); // flags
if (arch >= 15) {
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
/* Change bits [51;50] to be ACMPXCHG */
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
hex ^= (0b11ull << 50);
}
} else {
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
hex |= (0xc0ull << 40); // flags
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
}
break;
case BI_OPCODE_ATOM1_RETURN_I32:
/* Permit omitting the destination for plain ATOM1 */
if (!bi_count_write_registers(I, 0)) {
if (arch < 15 && !bi_count_write_registers(I, 0)) {
hex |= (0x40ull << 40); // fake read
}
/* 64-bit source */
va_validate_register_pair(I, 0);
hex |= (uint64_t)va_pack_src(I, 0) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
break;
case BI_OPCODE_ACMPXCHG_I32:
@ -960,41 +1238,67 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_ATOM_RETURN_I32:
/* 64-bit source */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
hex |= (0xc0ull << 40); // flags
if (arch >= 15) {
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
/* Change bits [51:50] to be ACMPXCHG */
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
hex ^= (0b11ull << 50);
}
} else {
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
hex |= (0xc0ull << 40); // flags
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
}
break;
case BI_OPCODE_LD_CVT:
hex |= (uint64_t)va_pack_src(I, 0);
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0);
hex |= va_pack_byte_offset(I);
/* Conversion descriptor */
hex |= (uint64_t)va_pack_src(I, 2) << 16;
hex |= (uint64_t)I->mem_access << 37;
if (arch >= 15)
hex |= va_pack_src_v15(I, 2, 2);
else
hex |= (uint64_t)va_pack_src(I, 2) << 16;
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
break;
case BI_OPCODE_ST_CVT:
/* Staging read */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset(I);
/* Conversion descriptor */
hex |= (uint64_t)va_pack_src(I, 3) << 16;
hex |= (uint64_t)I->mem_access << 37;
if (arch >= 15)
hex |= va_pack_src_v15(I, 3, 2);
else
hex |= (uint64_t)va_pack_src(I, 3) << 16;
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
break;
case BI_OPCODE_BLEND: {
/* Source 0 - Blend descriptor (64-bit) */
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 2, 0);
else
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
va_validate_register_pair(I, 2);
/* Target */
@ -1005,7 +1309,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
hex |= ((I->branch_offset >> 3) << 8);
/* Source 2 - coverage mask */
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 2);
else
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
/* Vector size */
unsigned vecsize = 4;
@ -1015,7 +1322,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
}
case BI_OPCODE_LD_GCLK_U64:
hex |= va_pack_gclk(I);
hex |= va_pack_gclk(I) << ((arch >= 15) ? 8 : 0);
break;
case BI_OPCODE_TEX_GRADIENT:
@ -1023,7 +1330,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_TEX_FETCH:
case BI_OPCODE_TEX_GATHER: {
/* Image to read from */
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
I->shadow)
@ -1040,7 +1350,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
if (I->skip)
hex |= (1ull << 39);
if (!bi_is_regfmt_16(I->register_format))
hex |= (1ull << 46);
hex |= (1ull << ((arch >= 15) ? 38 : 46));
if (I->op == BI_OPCODE_TEX_GRADIENT) {
if (I->force_delta_enable)
@ -1062,20 +1372,35 @@ va_pack_instr(const bi_instr *I, unsigned arch)
hex |= ((uint64_t)I->fetch_component) << 14;
}
hex |= (I->write_mask << 22);
hex |= (I->write_mask << ((arch >= 15) ? 24 : 22));
hex |= ((uint64_t)I->dimension) << 28;
break;
}
default:
if (!info.exact && I->op != BI_OPCODE_NOP)
if (!info.exact && (arch >= 15 || I->op != BI_OPCODE_NOP))
invalid_instruction(I, "opcode");
hex |= va_pack_alu(I, arch);
break;
}
/* On v15, some instructions require an encoded null src. */
if (arch >= 15) {
switch (I->op) {
case BI_OPCODE_NOP:
case BI_OPCODE_LD_VAR_FLAT_IMM:
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
case BI_OPCODE_LD_GCLK_U64:
case BI_OPCODE_BARRIER:
hex |= va_pack_src_null_v15(0);
break;
default:
break;
}
}
return hex;
}
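The (arch >= 15) ? new : old shift selections scattered through va_pack_instr() all follow one pattern: several fields moved by a few bits between the v9-v13 and v15 encodings (atomic opcode 22 to 24, texture write mask 22 to 24, mem_access 37 to 35, the wide register-format flag 46 to 38). A minimal sketch of that pattern; the helper name and the idea of factoring it out are illustrative only:

static inline uint64_t
va_pack_shifted(uint64_t value, unsigned arch, unsigned pre_v15_shift,
                unsigned v15_shift)
{
   /* Hypothetical helper; va_pack_instr() above open-codes this per field. */
   return value << ((arch >= 15) ? v15_shift : pre_v15_shift);
}

/* e.g. hex |= va_pack_shifted(va_pack_atom_opc(I), arch, 22, 24); */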

View file

@ -9,7 +9,7 @@
#include "valhall.h"
void
va_count_instr_stats(bi_instr *I, struct va_stats *stats)
va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats)
{
/* Adjusted for 64-bit arithmetic */
unsigned words = bi_count_write_registers(I, 0);
@ -35,7 +35,7 @@ va_count_instr_stats(bi_instr *I, struct va_stats *stats)
}
}
}
switch (valhall_opcodes[I->op].unit) {
switch (get_valhall_opcode(I->op, arch).unit) {
/* Arithmetic is 2x slower for 64-bit than 32-bit */
case VA_UNIT_FMA:
stats->fma += words;

View file

@ -93,7 +93,8 @@ fau_state_uniform(struct fau_state *fau, bi_index idx, enum bi_opcode op)
}
static bool
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op,
unsigned arch)
{
for (unsigned i = 0; i < ARRAY_SIZE(fau->buffer); ++i) {
bi_index buf = fau->buffer[i];
@ -106,7 +107,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
/* Instructions executed by the messaging unit should not encode WARP_ID or
* anything from special page 3. */
if (can_run_on_message_unit(op) &&
(va_fau_page(idx.value) == 3 || idx.value == BIR_FAU_WARP_ID))
(va_fau_page(idx.value, arch) == 3 || idx.value == BIR_FAU_WARP_ID))
return false;
return fau->uniform_slot == -1 || can_use_two_fau_indices(op);
@ -114,7 +115,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
static bool
valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
enum bi_opcode op)
enum bi_opcode op, unsigned arch)
{
if (src.type != BI_INDEX_FAU)
return true;
@ -128,42 +129,42 @@ valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
return fau_state_buffer(fau, src);
}
bool valid = (fau_page == va_fau_page(src.value));
bool valid = (fau_page == va_fau_page(src.value, arch));
valid &= fau_state_buffer(fau, src);
if (src.value & BIR_FAU_UNIFORM)
valid &= fau_state_uniform(fau, src, op);
else if (fau_is_special(src.value))
valid &= fau_state_special(fau, src, op);
valid &= fau_state_special(fau, src, op, arch);
return valid;
}
bool
va_validate_fau(bi_instr *I)
va_validate_fau(bi_instr *I, unsigned arch)
{
bool valid = true;
struct fau_state fau = {.uniform_slot = -1};
unsigned fau_page = va_select_fau_page(I);
unsigned fau_page = va_select_fau_page(I, arch);
bi_foreach_src(I, s) {
valid &= valid_src(&fau, fau_page, I->src[s], I->op);
valid &= valid_src(&fau, fau_page, I->src[s], I->op, arch);
}
return valid;
}
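A hedged sketch of how a pass could combine this check with the repair helper defined just below, using builder construction as seen elsewhere in the compiler (the pass itself is hypothetical):

static void
va_lower_fau_sketch(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (!va_validate_fau(I, ctx->arch)) {
         /* Insert repair moves ahead of the offending instruction. */
         bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
         va_repair_fau(&b, I, ctx->arch);
      }
   }
}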
void
va_repair_fau(bi_builder *b, bi_instr *I)
va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch)
{
struct fau_state fau = {.uniform_slot = -1};
unsigned fau_page = va_select_fau_page(I);
unsigned fau_page = va_select_fau_page(I, arch);
bi_foreach_src(I, s) {
struct fau_state push = fau;
bi_index src = I->src[s];
if (!valid_src(&fau, fau_page, src, I->op)) {
if (!valid_src(&fau, fau_page, src, I->op, arch)) {
bi_replace_src(I, s, bi_mov_i32(b, bi_strip_index(src)));
/* Rollback update. Since the replacement move doesn't affect FAU
@ -180,7 +181,7 @@ va_validate(FILE *fp, bi_context *ctx)
bool errors = false;
bi_foreach_instr_global(ctx, I) {
if (!va_validate_fau(I)) {
if (!va_validate_fau(I, ctx->arch)) {
if (!errors) {
fprintf(fp, "Validation failed, this is a bug. Shader:\n\n");
bi_print_shader(ctx, fp);

View file

@ -97,10 +97,10 @@ valhall_opcodes[BI_NUM_OPCODES] = {
sr_control = 0
if len(op.staging) > 0:
sr_control = op.staging[0].encoded_flags >> 6
sr_control = op.staging[0].encoded_flags
%>
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
.exact = ${hex(exact(op))}ULL,
.exact = ${hex(exact(op.opcode))}ULL,
.srcs = {
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
{
@ -141,12 +141,84 @@ valhall_opcodes[BI_NUM_OPCODES] = {
% endif
% endfor
};
const struct va_opcode_info
valhall_v15_opcodes[BI_NUM_OPCODES] = {
% for op in instructions:
% if op.name not in skip:
<%
name = op.name
if name == 'BRANCHZ':
name = 'BRANCHZ.i16'
sr_control = 0
if len(op.staging) > 0:
sr_control = op.staging[0].encoded_flags
%>
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
.exact = ${hex(exact(op.opcode_v15))}ULL,
.srcs = {
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
{
.absneg = ${ibool(src.absneg)},
.swizzle = ${ibool(src.swizzle)},
.notted = ${ibool(src.notted)},
.widen = ${ibool(src.widen)},
.lanes = ${ibool(src.lanes)},
.halfswizzle = ${ibool(src.halfswizzle)},
.lane = ${ibool(src.lane)},
.combine = ${ibool(src.combine)},
% if src.size in [8, 16, 32, 64]:
.size = VA_SIZE_${src.size},
% endif
},
% endfor
},
.type_size = ${typesize(op.name)},
.has_dest = ${ibool(len(op.dests) > 0)},
.is_signed = ${ibool(op.is_signed)},
.unit = VA_UNIT_${op.unit},
.nr_srcs = ${len(op.srcs)},
.nr_staging_srcs = ${sum([sr.read for sr in op.staging])},
.nr_staging_dests = ${sum([sr.write for sr in op.staging])},
.clamp = ${hasmod(x, 'clamp')},
.saturate = ${hasmod(x, 'saturate')},
.rhadd = ${hasmod(x, 'rhadd')},
.round_mode = ${hasmod(x, 'round_mode')},
.condition = ${hasmod(x, 'condition')},
.result_type = ${hasmod(x, 'result_type')},
.vecsize = ${hasmod(x, 'vector_size')},
.register_format = ${hasmod(x, 'register_format')},
.slot = ${hasmod(x, 'slot')},
.sr_count = ${hasmod(x, 'staging_register_count')},
.sr_write_count = ${hasmod(x, 'staging_register_write_count')},
.sr_control = ${sr_control},
},
% endif
% endfor
};
const struct va_opcode_info
get_valhall_opcode(enum bi_opcode op, unsigned arch)
{
assert(arch >= 9);
if (arch < 15)
return valhall_opcodes[op];
else
return valhall_v15_opcodes[op];
}
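/* Usage sketch (assumed caller-side spelling): direct valhall_opcodes[op]
 * indexing becomes an arch-aware lookup, e.g.
 *
 *    const struct va_opcode_info info = get_valhall_opcode(I->op, arch);
 *    uint64_t hex = info.exact;
 *
 * Returning the struct by value lets both tables stay behind one entry
 * point while keeping call sites unchanged apart from the arch argument.
 */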
"""
# Exact value to be ORed into every opcode
def exact_op(op):
def exact_op(opcode):
exact_op = 0
for subcode in op.opcode:
# Need an early return in case of removed instructions
if not opcode:
return exact_op
for subcode in opcode:
exact_op |= (subcode.value << subcode.start)
return exact_op
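As a worked example with hypothetical subcode values, an opcode built from {value 0x90 starting at bit 48} and {value 0x1 starting at bit 40} yields exact == (0x90 << 48) | (0x1 << 40) == 0x90010000000000, while the early return above makes exact 0 for instructions that have no encoding on the selected revision.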

View file

@ -89,7 +89,8 @@ struct va_opcode_info {
unsigned sr_control : 2;
};
extern const struct va_opcode_info valhall_opcodes[BI_NUM_OPCODES];
const struct va_opcode_info get_valhall_opcode(enum bi_opcode op,
unsigned arch);
/* Bifrost specifies the source of bitwise operations as (A, B, shift), but
* Valhall specifies (A, shift, B). We follow Bifrost conventions in the
@ -130,10 +131,10 @@ va_swap_12(enum bi_opcode op)
}
static inline struct va_src_info
va_src_info(enum bi_opcode op, unsigned src)
va_src_info(enum bi_opcode op, unsigned src, unsigned arch)
{
unsigned idx = (va_swap_12(op) && (src == 1 || src == 2)) ? (3 - src) : src;
return valhall_opcodes[op].srcs[idx];
return get_valhall_opcode(op, arch).srcs[idx];
}
static inline bool

View file

@ -14,6 +14,7 @@ import sys
instructions = []
MODIFIERS = {}
MODIFIERS_V15 = {}
enums = {}
immediates = []
@ -102,6 +103,11 @@ class Source:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['high1_v15'] = (index + 48)
self.mask['high1_v15'] = bitmask(1)
self.offset['low8_v15'] = self.start
self.mask['low8_v15'] = bitmask(8)
if absneg:
self.offset['neg'] = 32 + 2 + ((2 - index) * 2)
self.offset['abs'] = 33 + 2 + ((2 - index) * 2)
@ -137,6 +143,11 @@ class Dest:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['mode_v15'] = self.start + 13
self.mask['mode_v15'] = bitmask(2)
self.offset['value_v15'] = self.start
self.mask['value_v15'] = bitmask(8)
class Staging:
def __init__(self, read = False, write = False, count = 0, flags = 'true', name = ""):
self.name = name
@ -152,6 +163,14 @@ class Staging:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['flags'] = self.start + 6
self.mask['flags'] = bitmask(2)
self.offset['value_v15'] = self.start
self.mask['value_v15'] = bitmask(8)
self.offset['flags_v15'] = 38
self.mask['flags_v15'] = bitmask(2)
# For compatibility
self.absneg = False
@ -166,11 +185,14 @@ class Staging:
if not self.flags:
self.encoded_flags = 0
self.encoded_flags_v15 = 0
elif flags == 'rw':
self.encoded_flags = 0xc0
self.encoded_flags = 0b11
self.encoded_flags_v15 = 0b11
else:
assert(flags == 'true')
self.encoded_flags = (0x80 if write else 0) | (0x40 if read else 0)
self.encoded_flags = (0b10 if write else 0) | (0b01 if read else 0)
self.encoded_flags_v15 = (0b10 if read else 0) | (0b01 if read and write else 0)
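The v15 layouts declared above split each source operand into eight low bits at the field's base position plus one high bit at 48 + index. A minimal C sketch under exactly those assumptions (the helper name is invented):

static inline uint64_t
pack_src_v15_sketch(uint64_t value, unsigned start, unsigned index)
{
   uint64_t hex = (value & 0xff) << start;       /* low8_v15  */
   hex |= ((value >> 8) & 0x1) << (48 + index);  /* high1_v15 */
   return hex;
}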
class Immediate:
def __init__(self, name, start, size, signed):
@ -186,13 +208,16 @@ class Opcode:
self.mask = mask
class Instruction:
def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None):
def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], immediates_v15 = [], modifiers = [], modifiers_v15 = [], staging = None, unit = None):
self.name = name
self.srcs = srcs
self.dests = dests
self.opcode = opcode
self.opcode_v15 = opcode_v15
self.immediates = immediates
self.immediates_v15 = immediates_v15
self.modifiers = modifiers
self.modifiers_v15 = modifiers_v15
self.staging = staging
self.unit = unit
self.is_signed = len(name.split(".")) > 1 and ('s' in name.split(".")[1])
@ -205,6 +230,11 @@ class Instruction:
self.offset['fau_page'] = 57
self.mask['fau_page'] = bitmask(2)
self.offset['flow_v15'] = 58
self.mask['flow_v15'] = bitmask(4)
self.offset['fau_page_v15'] = 62
self.mask['fau_page_v15'] = bitmask(2)
# Message-passing instruction <===> not ALU instruction
self.message = unit not in ["FMA", "CVT", "SFU"]
@ -273,6 +303,7 @@ def build_instr(el, overrides = {}):
# Get overridables
name = overrides.get('name') or el.attrib.get('name')
opcode = overrides.get('opcode') or build_opcode(el, 'opcode')
opcode_v15 = overrides.get('opcode_v15') or build_opcode(el, 'opcode_v15')
unit = overrides.get('unit') or el.attrib.get('unit')
# Get explicit sources/dests
@ -304,15 +335,25 @@ def build_instr(el, overrides = {}):
# Get immediates
imms = [build_imm(imm) for imm in el.findall('imm')]
imms_v15 = [build_imm(imm) for imm in el.findall('imm_v15_override')]
for imm in imms:
if imm.name not in {imm.name for imm in imms_v15}:
imms_v15.append(imm)
modifiers = []
modifiers_v15 = []
for mod in el:
if (mod.tag in MODIFIERS) and not (mod.attrib.get('pseudo', False)):
modifiers.append(MODIFIERS[mod.tag])
modifiers_v15.append(MODIFIERS_V15[mod.tag])
elif mod.tag =='va_mod':
modifiers.append(build_modifier(mod))
elif mod.tag =='va_mod_v15':
modifiers_v15.append(build_modifier(mod))
instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit)
instr = Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, immediates_v15 = imms_v15,
modifiers = modifiers, modifiers_v15 = modifiers_v15, staging = staging, unit = unit)
instructions.append(instr)
@ -323,6 +364,7 @@ def build_group(el):
build_instr(el, overrides = {
'name': ins.attrib['name'],
'opcode': build_opcode(ins, 'opcode'),
'opcode_v15': build_opcode(ins, 'opcode_v15'),
'unit': ins.attrib.get('unit'),
})
@ -377,6 +419,7 @@ def typesize(name):
# Parse the ISA
def valhall_parse_isa(xmlfile):
global MODIFIERS
global MODIFIERS_V15
global enums
global immediates
global root
@ -404,7 +447,6 @@ def valhall_parse_isa(xmlfile):
"lod_bias_disable": Modifier("lod_mode", 13, 1),
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
"write_mask": Modifier("write_mask", 22, 4),
"register_type": Modifier("register_type", 26, 2),
"dimension": Modifier("dimension", 28, 2),
"skip": Flag("skip", 39),
"register_width": Modifier("register_width", 46, 1, force_enum = "register_width"),
@ -438,6 +480,52 @@ def valhall_parse_isa(xmlfile):
"sample": Modifier("sample_mode", 38, 2),
}
MODIFIERS_V15 = {
# Texture instructions share a common encoding
"wide_indices": Flag("wide_indices", 8),
"array_enable": Flag("array_enable", 10),
"texel_offset": Flag("texel_offset", 11),
"shadow": Flag("shadow", 12),
"integer_coordinates": Flag("integer_coordinates", 13),
"fetch_component": Modifier("fetch_component", 14, 2),
"lod_mode": Modifier("lod_mode", 13, 3),
"lod_bias_disable": Modifier("lod_mode", 13, 1),
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
"write_mask": Modifier("write_mask", 24, 4),
"dimension": Modifier("dimension", 28, 2),
"skip": Flag("skip", 39),
"register_width": Modifier("register_width", 38, 1, force_enum = "register_width"),
"secondary_register_width": Modifier("secondary_register_width", 54, 1, force_enum = "register_width"),
"vartex_register_width": Modifier("varying_texture_register_width", 24, 2),
"atom_opc": Modifier("atomic_operation", 24, 4),
"atom_opc_1": Modifier("atomic_operation_with_1", 24, 3),
"inactive_result": Modifier("inactive_result", 22, 4),
"memory_access": Modifier("memory_access", 24, 2),
"regfmt": Modifier("register_format", 24, 3),
"source_format": Modifier("source_format", 24, 2),
"vecsize": Modifier("vector_size", 28, 2),
"slot": Modifier("slot_v15", 30, 2),
"roundmode": Modifier("round_mode", 32, 2),
"result_type": Modifier("result_type", 24, 2),
"saturate": Flag("saturate", 25),
"not_result": Flag("not_result", 34),
"lane_op": Modifier("lane_operation", 32, 4),
"cmp": Modifier("condition", 33, 3),
"clamp": Modifier("clamp", 30, 2),
"sr_count": Modifier("staging_register_count", 32, 3, implied = True),
"sample_and_update": Modifier("sample_and_update_mode", 32, 3),
"sr_write_count": Modifier("staging_register_write_count", 35, 3, implied = True),
"conservative": Flag("conservative", 35),
"subgroup": Modifier("subgroup_size", 36, 4),
"update": Modifier("update_mode", 35, 2),
"sample": Modifier("sample_mode", 37, 2),
}
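Each relocated placement above surfaces in the packer as an arch-conditional shift; for instance the texture write mask, moved from bit 22 to bit 24, is emitted exactly as in the va_pack_instr() hunk earlier:

   hex |= (I->write_mask << ((arch >= 15) ? 24 : 22));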
for child in root:
if child.tag == 'group':
build_group(child)

View file

@ -52,8 +52,10 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg)
case 11:
case 12:
case 13:
return merge_wg ? &bifrost_nir_options_v11_merge_wg :
&bifrost_nir_options_v11;
case 14:
case 15:
return merge_wg ? &bifrost_nir_options_v11_merge_wg
: &bifrost_nir_options_v11;
default:
assert(!"Unsupported arch");
return NULL;
@ -285,7 +287,8 @@ pan_disassemble(FILE *fp, const void *code, size_t size, uint64_t gpu_id,
bool verbose)
{
if (pan_arch(gpu_id) >= 9)
disassemble_valhall(fp, (const uint64_t *)code, size, verbose);
disassemble_valhall(fp, (const uint64_t *)code, size, pan_arch(gpu_id),
verbose);
else if (pan_arch(gpu_id) >= 6)
disassemble_bifrost(fp, code, size, verbose);
else

View file

@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
case MALI_CS_OPCODE_STORE_MULTIPLE:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
#if PAN_ARCH >= 12
case MALI_CS_OPCODE_RUN_IDVS2:
@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
}
#endif
#if PAN_ARCH >= 14
static inline void
cs_run_fragment2(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
{
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_FRAG_RES;
cs_emit(b, RUN_FRAGMENT2, I) {
I.enable_tem = enable_tem;
I.tile_order = tile_order;
}
}
#else
static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
I.tile_order = tile_order;
}
}
#endif
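A hedged call-site sketch; the tile-order enum value name is assumed from the genxml naming scheme:

   /* Emits RUN_FRAGMENT2 on v14+ builds; pre-v14 builds compile the
    * cs_run_fragment() variant above instead. */
   cs_run_fragment2(b, false /* enable_tem */,
                    MALI_TILE_RENDER_ORDER_Z_ORDER);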
static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
(int16_t)(offsetof(struct cs_##__type##_trace, __field) - \
sizeof(struct cs_##__type##_trace))
#if PAN_ARCH >= 14
#define CS_RUN_FRAGMENT2_SR_COUNT 56
#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT)
struct cs_run_fragment2_trace {
uint64_t ip;
uint32_t sr[CS_RUN_FRAGMENT2_SR_COUNT];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, bool enable_tem,
enum mali_tile_render_order tile_order)
{
if (likely(!ctx->enabled)) {
cs_run_fragment2(b, enable_tem, tile_order);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs,
sizeof(struct cs_run_fragment2_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_fragment2(b, enable_tem, tile_order);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip));
ASSERTED unsigned sr_count = 0;
unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr);
for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) {
unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16);
if (!mask)
continue;
cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask,
sr_offset);
sr_offset += util_bitcount(mask) * sizeof(uint32_t);
sr_count += util_bitcount(mask);
}
assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT);
cs_flush_stores(b);
}
#else
struct cs_run_fragment_trace {
uint64_t ip;
uint32_t sr[7];
@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_fragment, sr));
cs_flush_stores(b);
}
#endif
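Size check for the new trace record: 8 bytes of IP plus 56 x 4 bytes of saved registers is 232 bytes, which the aligned(64) attribute rounds up to 256 bytes per RUN_FRAGMENT2 entry.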
#if PAN_ARCH >= 13
#define CS_RUN_FULLSCREEN_SR_MASK \

View file

@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va)
}
static void
pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va,
const struct MALI_FRAMEBUFFER_PARAMETERS *fb)
void
GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count)
{
pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va);
ctx->indent++;
for (int i = 0; i < (fb->render_target_count); i++)
for (int i = 0; i < render_target_count; i++)
pandecode_rt(ctx, i, gpu_va);
ctx->indent--;
pandecode_log(ctx, "\n");
}
static void
pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
void
GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va)
{
const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR(
ctx, zs_crc_packed, (uint64_t)gpu_va);
@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
#if PAN_ARCH >= 6
static void
pandecode_sample_locations(struct pandecode_context *ctx, const void *fb)
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id)
{
pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params);
const unsigned dcd_size = pan_size(DRAW);
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations);
if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer,
pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n",
params.sample_locations);
if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
dcd_pointer + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
}
void
GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
uint64_t sample_locations)
{
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations);
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", sample_locations);
for (int i = 0; i < 33; i++) {
pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128,
samples[2 * i + 1] - 128);
}
}
#endif
#endif /* PAN_ARCH >= 6 */
#if PAN_ARCH < 14
struct pandecode_fbd
GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
bool is_fragment, uint64_t gpu_id)
@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n");
#if PAN_ARCH >= 6
pandecode_sample_locations(ctx, fb);
GENX(pandecode_sample_locations)(ctx, params.sample_locations);
unsigned dcd_size = pan_size(DRAW);
unsigned job_type_param = 0;
#if PAN_ARCH <= 9
job_type_param = MALI_JOB_TYPE_FRAGMENT;
#endif
if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n",
params.frame_shader_dcds, params.pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
params.frame_shader_dcds + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
GENX(pandecode_frame_shader_dcds)
(ctx, params.frame_shader_dcds, params.pre_frame_0, params.pre_frame_1,
params.post_frame, job_type_param, gpu_id);
#else
DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n");
@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
gpu_va += pan_size(FRAMEBUFFER);
if (params.has_zs_crc_extension) {
pandecode_zs_crc_ext(ctx, gpu_va);
GENX(pandecode_zs_crc_ext)(ctx, gpu_va);
gpu_va += pan_size(ZS_CRC_EXTENSION);
}
if (is_fragment)
pandecode_rts(ctx, gpu_va, &params);
GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count);
return (struct pandecode_fbd){
.rt_count = params.render_target_count,
@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
};
#endif
}
#endif /* PAN_ARCH < 14 */
#if PAN_ARCH >= 5
uint64_t

View file

@ -132,6 +132,20 @@ void pandecode_cs_binary_v13(struct pandecode_context *ctx, uint64_t bin,
void pandecode_cs_trace_v13(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
void pandecode_interpret_cs_v14(struct pandecode_context *ctx, uint64_t queue,
uint32_t size, uint64_t gpu_id, uint32_t *regs);
void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin,
uint32_t bin_size);
void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
void pandecode_interpret_cs_v15(struct pandecode_context *ctx, uint64_t queue,
uint32_t size, uint64_t gpu_id, uint32_t *regs);
void pandecode_cs_binary_v15(struct pandecode_context *ctx, uint64_t bin,
uint32_t bin_size);
void pandecode_cs_trace_v15(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
/* Logging infrastructure */
static void
pandecode_make_indent(struct pandecode_context *ctx)
@ -275,4 +289,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx,
#endif
#if PAN_ARCH >= 6
void GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
uint64_t sample_locations);
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id);
#endif
#if PAN_ARCH >= 5
void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count);
void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va);
#endif
#endif /* __MMAP_TRACE_H__ */

View file

@ -423,6 +423,12 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va,
case 13:
pandecode_interpret_cs_v13(ctx, queue_gpu_va, size, gpu_id, regs);
break;
case 14:
pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs);
break;
case 15:
pandecode_interpret_cs_v15(ctx, queue_gpu_va, size, gpu_id, regs);
break;
default:
UNREACHABLE("Unsupported architecture");
}
@ -446,6 +452,12 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va,
case 13:
pandecode_cs_binary_v13(ctx, bin_gpu_va, size);
break;
case 14:
pandecode_cs_binary_v14(ctx, bin_gpu_va, size);
break;
case 15:
pandecode_cs_binary_v15(ctx, bin_gpu_va, size);
break;
default:
UNREACHABLE("Unsupported architecture");
}
@ -469,6 +481,12 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va,
case 13:
pandecode_cs_trace_v13(ctx, trace_gpu_va, size, gpu_id);
break;
case 14:
pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id);
break;
case 15:
pandecode_cs_trace_v15(ctx, trace_gpu_va, size, gpu_id);
break;
default:
UNREACHABLE("Unsupported architecture");
}

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2022-2023 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -117,8 +118,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
case MALI_CS_OPCODE_WAIT: {
cs_unpack(instr, CS_WAIT, I);
fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "",
I.wait_mask);
fprintf(fp, "WAIT #%x", I.wait_mask);
break;
}
@ -130,15 +130,13 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
* since we'll print them implicitly later.
*/
#if PAN_ARCH >= 12
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.task_increment, I.ep_limit);
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
I.fau_select, I.task_increment, I.ep_limit);
#else
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u",
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.task_increment);
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u",
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
I.fau_select, I.task_increment);
#endif
break;
}
@ -146,8 +144,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
#if PAN_ARCH == 10
case MALI_CS_OPCODE_RUN_TILING: {
cs_unpack(instr, CS_RUN_TILING, I);
fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
fprintf(fp, "RUN_TILING.srt%d.spd%d.tsd%d.fau%d", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select);
break;
}
@ -158,8 +155,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
cs_unpack(instr, CS_RUN_IDVS, I);
fprintf(
fp,
"RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "",
"RUN_IDVS%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
I.malloc_enable ? "" : ".no_malloc",
I.draw_id_register_enable ? ".draw_id_enable" : "",
I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select,
@ -178,8 +174,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
".INVALID",
};
fprintf(fp, "RUN_IDVS2%s%s%s%s r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "",
fprintf(fp, "RUN_IDVS2%s%s%s r%u, #%" PRIx64,
I.malloc_enable ? "" : ".no_malloc",
I.draw_id_register_enable ? ".draw_id_enable" : "",
vertex_shading_str[I.vertex_shading_mode], I.draw_id,
@ -318,31 +313,36 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
case MALI_CS_OPCODE_SHARED_SB_INC: {
cs_unpack(instr, CS_SHARED_SB_INC, I);
const char *progress_increment_name[] = {
".no_increment",
".increment",
};
fprintf(fp, "SHARED_SB_INC%s%s #%u, #%u",
progress_increment_name[I.progress_increment],
defer_mode_str(I), I.sb_mask, I.shared_entry);
fprintf(fp, "SHARED_SB_INC%s #%u, #%u", defer_mode_str(I), I.sb_mask,
I.shared_entry);
break;
}
case MALI_CS_OPCODE_SHARED_SB_DEC: {
cs_unpack(instr, CS_SHARED_SB_DEC, I);
const char *progress_increment_name[] = {
".no_increment",
".increment",
};
fprintf(fp, "SHARED_SB_DEC%s #%u",
progress_increment_name[I.progress_increment], I.shared_entry);
fprintf(fp, "SHARED_SB_DEC #%u", I.shared_entry);
break;
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
"unknown", "rev_horizontal", "rev_vertical", "unknown",
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s",
I.enable_tem ? ".tile_enable_map_enable" : "",
tile_order[I.tile_order]);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
@ -350,27 +350,25 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT, I);
fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
I.progress_increment ? ".progress_inc" : "",
fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s",
I.enable_tem ? ".tile_enable_map_enable" : "",
tile_order[I.tile_order]);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(instr, CS_RUN_FULLSCREEN, I);
fprintf(fp, "RUN_FULLSCREEN%s r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "", I.dcd,
I.flags_override);
fprintf(fp, "RUN_FULLSCREEN r%u, #%" PRIx64, I.dcd, I.flags_override);
break;
}
case MALI_CS_OPCODE_FINISH_TILING: {
cs_unpack(instr, CS_FINISH_TILING, I);
fprintf(fp, "FINISH_TILING%s",
I.progress_increment ? ".progress_inc" : "");
fprintf(fp, "FINISH_TILING");
break;
}
@ -443,12 +441,6 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
break;
}
case MALI_CS_OPCODE_PROGRESS_WAIT: {
cs_unpack(instr, CS_PROGRESS_WAIT, I);
fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue);
break;
}
case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I);
fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length);
@ -547,29 +539,16 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
break;
}
case MALI_CS_OPCODE_PROGRESS_STORE: {
cs_unpack(instr, CS_PROGRESS_STORE, I);
fprintf(fp, "PROGRESS_STORE d%u", I.source);
break;
}
case MALI_CS_OPCODE_PROGRESS_LOAD: {
cs_unpack(instr, CS_PROGRESS_LOAD, I);
fprintf(fp, "PROGRESS_LOAD d%u", I.destination);
break;
}
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
#if PAN_ARCH >= 12
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task,
I.ep_limit);
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.workgroups_per_task, I.ep_limit);
#else
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task);
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u",
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.workgroups_per_task);
#endif
break;
@ -672,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
@ -714,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
@ -1097,6 +1098,101 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
}
#endif
#if PAN_ARCH >= 14
static void
pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp,
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I)
{
if (qctx->in_exception_handler)
return;
ctx->indent++;
pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0));
pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1));
pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n",
cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER));
pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE));
for (unsigned i = 0; i < 11; ++i) {
const unsigned reg = MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2);
pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i,
cs_get_u64(qctx, reg));
}
DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3],
"Flags 3:\n");
DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX,
&qctx->regs[MALI_FRAGMENT_SR_BOUNDING_BOX], "Bounding Box:\n");
DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE],
"Frame size:\n");
pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx
->regs[MALI_FRAGMENT_SR_FLAGS_0],
FRAGMENT_FLAGS_0, flags0_unpacked)
;
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n");
pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx
->regs[MALI_FRAGMENT_SR_FLAGS_1],
FRAGMENT_FLAGS_1, flags1_unpacked)
;
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n");
DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2],
"Flags 2:\n");
pandecode_log(ctx, "Z clear: %f\n",
uif(cs_get_u32(qctx, MALI_FRAGMENT_SR_Z_CLEAR)));
const uint64_t tiler_pointer =
cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER);
pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n",
tiler_pointer);
const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER);
pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer);
const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER);
pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer);
pandecode_log(ctx, "Frame argument: %" PRIx64 "\n",
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG));
const uint64_t sample_locations =
cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER);
pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations);
const uint64_t dcd_pointer =
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER);
pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer);
DUMP_CL(ctx, VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE],
"VRS image:\n");
GENX(pandecode_sample_locations)
(ctx, sample_locations);
const unsigned job_type_param = 0;
GENX(pandecode_frame_shader_dcds)
(ctx, dcd_pointer, flags0_unpacked.pre_frame_0, flags0_unpacked.pre_frame_1,
flags0_unpacked.post_frame, job_type_param, qctx->gpu_id);
if (tiler_pointer)
GENX(pandecode_tiler)(ctx, tiler_pointer);
if (dbd_pointer)
GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer);
if (rtd_pointer)
GENX(pandecode_rts)
(ctx, rtd_pointer, flags1_unpacked.render_target_count);
ctx->indent--;
}
#else
static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
@ -1115,6 +1211,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
ctx->indent--;
}
#endif /* PAN_ARCH >= 14 */
static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
@ -1261,11 +1358,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
cs_unpack(bytes, CS_RUN_FRAGMENT2, I);
pandecode_run_fragment2(ctx, fp, qctx, &I);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
cs_unpack(bytes, CS_RUN_FRAGMENT, I);
pandecode_run_fragment(ctx, fp, qctx, &I);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
@ -2192,18 +2297,6 @@ collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
break;
}
case MALI_CS_OPCODE_PROGRESS_LOAD: {
cs_unpack(instr, CS_PROGRESS_LOAD, I);
for (unsigned i = 0; i < 16; i++) {
if (BITSET_TEST(track_map, I.destination) ||
BITSET_TEST(track_map, I.destination + 1)) {
ibranch->has_unknown_targets = true;
return;
}
}
break;
}
default:
break;
}
@ -2430,7 +2523,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
#else
case MALI_CS_OPCODE_RUN_IDVS:
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
@ -2539,6 +2637,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
struct cs_run_fragment2_trace *frag_trace = trace_data;
assert(trace_size >= sizeof(*frag_trace));
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
memcpy(&regs[0], frag_trace->sr, sizeof(frag_trace->sr));
pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I);
trace_data = frag_trace + 1;
trace_size -= sizeof(*frag_trace);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
struct cs_run_fragment_trace *frag_trace = trace_data;
@ -2550,6 +2661,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
trace_size -= sizeof(*frag_trace);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
struct cs_run_fullscreen_trace *fs_trace = trace_data;

View file

@ -61,6 +61,12 @@
#elif (PAN_ARCH == 13)
#define GENX(X) X##_v13
#include "genxml/v13_pack.h"
#elif (PAN_ARCH == 14)
#define GENX(X) X##_v14
#include "genxml/v14_pack.h"
#elif (PAN_ARCH == 15)
#define GENX(X) X##_v15
#include "genxml/v15_pack.h"
#else
#error "Need to add suffixing macro for this architecture"
#endif
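For instance, a translation unit built with PAN_ARCH defined as 15 resolves GENX(pandecode_cs_trace) to pandecode_cs_trace_v15, matching the per-arch prototypes declared earlier:

   /* With PAN_ARCH == 15: */
   GENX(pandecode_cs_trace)(ctx, trace, trace_size, gpu_id);
   /* compiles as pandecode_cs_trace_v15(ctx, trace, trace_size, gpu_id); */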

View file

@ -83,23 +83,34 @@ def parse_modifier(modifier):
if modifier is None:
return None
for mod in MODIFIERS:
if modifier[0:len(mod)] == mod:
if mod == "log2":
assert(len(mod) == len(modifier))
return [mod]
ret = []
split_modifiers = modifier.split()
if modifier[len(mod)] == '(' and modifier[-1] == ')':
ret = [mod, int(modifier[(len(mod) + 1):-1])]
if ret[0] == 'align':
align = ret[1]
# Make sure the alignment is a power of 2
assert(align > 0 and not(align & (align - 1)));
for mod in split_modifiers:
valid = False
for valid_mod in MODIFIERS:
if mod[0:len(valid_mod)] == valid_mod:
if valid_mod == "log2":
assert(len(valid_mod) == len(modifier))
# Add a number to simplify parsing
ret.extend([valid_mod, 0])
valid = True
break
return ret
if mod[len(valid_mod)] == '(' and mod[-1] == ')':
mod_arg = [valid_mod, int(mod[(len(valid_mod) + 1):-1])]
if mod_arg[0] == 'align':
align = mod_arg[1]
# Make sure the alignment is a power of 2
assert(align > 0 and not(align & (align - 1)));
print("Invalid modifier")
assert(False)
ret.extend(mod_arg)
valid = True
break
assert valid, f"Invalid modifier: {modifier}"
return ret
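As a worked example, the reworked parser maps the whitespace-separated string "minus(1) shr(2)" to the flat list ["minus", 1, "shr", 2], and a bare "log2" to ["log2", 0] thanks to the placeholder number appended above.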
class Aggregate(object):
def __init__(self, parser, name, attrs):
@ -169,7 +180,7 @@ class Field(object):
if self.type in self.parser.enums and self.default is not None:
self.default = safe_name('{}_{}_{}'.format(global_prefix, self.type, self.default)).upper()
self.modifier = parse_modifier(attrs.get("modifier"))
def emit_template_struct(self, dim):
if self.type == 'address':
@ -291,14 +302,22 @@ class Group(object):
if field.modifier is None:
continue
if field.modifier[0] == "shr":
shift = field.modifier[1]
mask = hex((1 << shift) - 1)
print(" assert(((__unpacked)->{} & {}) == 0); \\".format(field.name, mask))
elif field.modifier[0] == "minus":
print(" assert((__unpacked)->{} >= {}); \\".format(field.name, field.modifier[1]))
elif field.modifier[0] == "log2":
print(" assert(IS_POT_NONZERO((__unpacked)->{})); \\".format(field.name))
value = "(__unpacked)->{}".format(field.name)
for mod, mod_val in zip (field.modifier[::2], field.modifier[1::2]):
if mod == "shr":
mask = hex((1 << mod_val) - 1)
print(" assert(({} & {}) == 0); \\".format(value, mask))
value = "({} >> {})".format(value, mod_val)
elif mod == "minus":
print(" assert({} >= {}); \\".format(value, mod_val))
value = "({} - {})".format(value, mod_val)
elif mod == "align":
mask = hex(mod_val - 1)
print(' assert(!({} & {})); \\'.format(value, mask))
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
elif mod == "log2":
print(" assert(IS_POT_NONZERO({})); \\".format(value))
value = "(util_logbase2({}))".format(value)
for index in range(self.length // 4):
# Handle MBZ words
@ -324,14 +343,15 @@ class Group(object):
value = "(__unpacked)->{}".format(contributor.path)
if field.modifier is not None:
if field.modifier[0] == "shr":
value = "{} >> {}".format(value, field.modifier[1])
elif field.modifier[0] == "minus":
value = "{} - {}".format(value, field.modifier[1])
elif field.modifier[0] == "align":
value = "ALIGN_POT({}, {})".format(value, field.modifier[1])
elif field.modifier[0] == "log2":
value = "util_logbase2({})".format(value)
for mod, mod_val in zip(field.modifier[::2], field.modifier[1::2]):
if mod == "shr":
value = "({} >> {})".format(value, mod_val)
elif mod == "minus":
value = "({} - {})".format(value, mod_val)
elif mod == "align":
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
elif mod == "log2":
value = "(util_logbase2({}))".format(value)
if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format", "Component Swizzle"]:
s = "util_bitpack_uint(%s, %d, %d)" % \
@ -435,25 +455,24 @@ class Group(object):
else:
s = "/* unhandled field %s, type %s */\n" % (field.name, field.type)
suffix = ""
prefix = ""
if field.modifier:
if field.modifier[0] == "minus":
suffix = " + {}".format(field.modifier[1])
elif field.modifier[0] == "shr":
suffix = " << {}".format(field.modifier[1])
if field.modifier[0] == "log2":
prefix = "1U << "
print(' {}({}); \\'.format(convert, ', '.join(args)))
if len(prefix) != 0 or len(suffix) != 0:
print(' (__unpacked)->{} = {}(__unpacked)->{}{}; \\'.format(fieldref.path, prefix, fieldref.path, suffix))
value = "(__unpacked)->{}".format(fieldref.path)
if field.modifier is not None:
# Need to reverse ([::-1]) modifier order when unpacking
for mod, mod_val in list(zip(field.modifier[::2], field.modifier[1::2]))[::-1]:
if mod == "shr":
value = "({} << {})".format(value, mod_val)
elif mod == "minus":
value = "({} + {})".format(value, mod_val)
elif mod == "align":
mask = hex(mod_val - 1)
print(' assert(!({} & {})); \\'.format(value, mask))
elif mod == "log2":
value = "(1U << {})".format(value)
if field.modifier and field.modifier[0] == "align":
mask = hex(field.modifier[1] - 1)
print(' assert(!((__unpacked)->{} & {})); \\'.format(fieldref.path, mask))
print(' (__unpacked)->{} = {}; \\'.format(fieldref.path, value))
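To make the chaining concrete: for a hypothetical field with modifier "minus(1) shr(2)", the pack path applies the list in order and emits ((v - 1) >> 2), while the unpack path above walks the list in reverse and rebuilds ((raw << 2) + 1); v = 9 packs to 2 and unpacks back to 9.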
def emit_print_function(self):
for field in self.fields:

View file

@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
pan_packers = []
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13']
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14', 'v15']
pan_packers += custom_target(
packer + '_pack.h',
input : ['gen_pack.py', packer + '.xml'],
@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency(
libpanfrost_decode_per_arch = []
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_decode_per_arch += static_library(
'pandecode-arch-v' + ver,
['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers],

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2020 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1163,6 +1166,13 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<!-- Block-linear interleaved clump orderings are not available on
all v10 architectures. -->
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2025 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1426,6 +1429,9 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2025 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1728,6 +1731,9 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

src/panfrost/genxml/v14.xml (new file, 2753 lines; diff suppressed because it is too large)

src/panfrost/genxml/v15.xml (new file, 2759 lines; diff suppressed because it is too large)

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2020 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -103,6 +104,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">

View file

@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
/* Maximum number of threads per workgroup. */
uint32_t max_threads_per_wg;
/* Granularity of number of active threads. */
uint32_t num_threads_active_granularity;
/* Number of registers per core. Can be used to determine the maximum
* number of threads that can be allocated for a specific shader based on
* the number of registers assigned to this shader.

View file

@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
props->num_registers_per_core =
panthor_dev->props.gpu.thread_features & 0x3fffff;
props->num_threads_active_granularity =
panthor_dev->props.gpu.thread_num_active_granularity;
/* We assume that all thread properties are populated. If we ever have a GPU
* that has one of the THREAD_xxx registers set to zero, we can always add a
* quirk here.
*/
assert(props->max_threads_per_wg && props->max_threads_per_core &&
props->max_tasks_per_core && props->num_registers_per_core);
assert(
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
props->max_threads_per_core && props->max_tasks_per_core &&
props->num_registers_per_core);
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
* of TLS instance per core is assumed to be the maximum number of threads
@ -153,8 +157,12 @@ panthor_dev_query_props(struct panthor_kmod_dev *panthor_dev)
{
struct pan_kmod_dev_props *props = &panthor_dev->base.props;
bool is_gpu_wide = panthor_dev->props.gpu.gpu_id == 0;
assert(!is_gpu_wide || panthor_dev->props.gpu.gpu_wide_id);
*props = (struct pan_kmod_dev_props){
.gpu_id = panthor_dev->props.gpu.gpu_id,
.gpu_id = is_gpu_wide ? panthor_dev->props.gpu.gpu_wide_id
: panthor_dev->props.gpu.gpu_id,
.gpu_variant = panthor_dev->props.gpu.core_features & 0xff,
.shader_present = panthor_dev->props.gpu.shader_present,
.tiler_features = panthor_dev->props.gpu.tiler_features,

View file

@ -4,7 +4,7 @@
subdir('kmod')
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13']
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_pixel_format = []
deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep]
@ -22,7 +22,7 @@ endforeach
libpanfrost_per_arch = []
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_per_arch += static_library(
'pan-arch-v' + ver,
[

View file

@ -3,6 +3,7 @@
* Copyright (C) 2014 Broadcom
* Copyright (C) 2018-2019 Alyssa Rosenzweig
* Copyright (C) 2019-2020 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -711,6 +712,32 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
case PAN_AFBC_MODE_R16G16B16A16:
return MALI_AFBC_COMPRESSION_MODE_R16G16B16A16;
#endif
#if PAN_ARCH >= 14
case PAN_AFBC_MODE_YUV420_6C8:
return MALI_AFBC_COMPRESSION_MODE_Y8U8V8_420;
case PAN_AFBC_MODE_YUV420_2C8:
return MALI_AFBC_COMPRESSION_MODE_R8G8;
case PAN_AFBC_MODE_YUV420_1C8:
return MALI_AFBC_COMPRESSION_MODE_R8;
case PAN_AFBC_MODE_YUV420_6C10:
return MALI_AFBC_COMPRESSION_MODE_Y10U10V10_420;
case PAN_AFBC_MODE_YUV420_2C10:
return MALI_AFBC_COMPRESSION_MODE_R10G10;
case PAN_AFBC_MODE_YUV420_1C10:
return MALI_AFBC_COMPRESSION_MODE_R10;
case PAN_AFBC_MODE_YUV422_4C8:
return MALI_AFBC_COMPRESSION_MODE_Y8U8Y8V8_422;
case PAN_AFBC_MODE_YUV422_2C8:
return MALI_AFBC_COMPRESSION_MODE_R8G8;
case PAN_AFBC_MODE_YUV422_1C8:
return MALI_AFBC_COMPRESSION_MODE_R8;
case PAN_AFBC_MODE_YUV422_4C10:
return MALI_AFBC_COMPRESSION_MODE_Y10U10Y10V10_422;
case PAN_AFBC_MODE_YUV422_2C10:
return MALI_AFBC_COMPRESSION_MODE_R10G10;
case PAN_AFBC_MODE_YUV422_1C10:
return MALI_AFBC_COMPRESSION_MODE_R10;
#else
case PAN_AFBC_MODE_YUV420_6C8:
return MALI_AFBC_COMPRESSION_MODE_YUV420_6C8;
case PAN_AFBC_MODE_YUV420_2C8:
@ -735,6 +762,7 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
return MALI_AFBC_COMPRESSION_MODE_YUV422_2C10;
case PAN_AFBC_MODE_YUV422_1C10:
return MALI_AFBC_COMPRESSION_MODE_YUV422_1C10;
#endif /* PAN_ARCH >= 14 */
#if PAN_ARCH == 9
case PAN_AFBC_MODE_R16:
case PAN_AFBC_MODE_R16G16:

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2023 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -347,6 +348,25 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
return (scan ? MALI_AFRC_FORMAT_R10G10B10A10_SCAN
: MALI_AFRC_FORMAT_R10G10B10A10_ROT);
#if PAN_ARCH >= 14
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
case PAN_AFRC_ICHANGE_FORMAT_YUV422:
case PAN_AFRC_ICHANGE_FORMAT_YUV420:
if (info.bpc == 8) {
if (plane == 0 || info.num_planes == 3)
return (scan ? MALI_AFRC_FORMAT_R8_SCAN : MALI_AFRC_FORMAT_R8_ROT);
return (scan ? MALI_AFRC_FORMAT_R8G8_SCAN : MALI_AFRC_FORMAT_R8G8_ROT);
}
if (plane == 0 || info.num_planes == 3)
return (scan ? MALI_AFRC_FORMAT_R10_SCAN : MALI_AFRC_FORMAT_R10_ROT);
assert(info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV422 ||
info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV420);
return (scan ? MALI_AFRC_FORMAT_R10G10_SCAN
: MALI_AFRC_FORMAT_R10G10_ROT);
#else
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
if (info.bpc == 8) {
if (plane == 0 || info.num_planes == 3)
@ -394,6 +414,7 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
return (scan ? MALI_AFRC_FORMAT_R10G10_420_SCAN
: MALI_AFRC_FORMAT_R10G10_420_ROT);
#endif /* PAN_ARCH >= 14 */
default:
return MALI_AFRC_FORMAT_INVALID;

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -11,6 +12,7 @@
#include "pan_afrc.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_fb.h"
#include "pan_props.h"
#include "pan_texture.h"
#include "pan_trace.h"
@ -1172,11 +1174,156 @@ check_fb_attachments(const struct pan_fb_info *fb)
#endif
}
#if PAN_ARCH >= 14
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx, void *out)
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
check_fb_attachments(fb);
const int crc_rt = GENX(pan_select_crc_rt)(fb, fb->tile_size);
const bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0);
const struct pan_clean_tile clean_tile = pan_get_clean_tile_info(fb);
/* Emit to memory the state that might change per layer. The static
* state is emitted directly to CSF registers by
* cs_emit_static_fragment_state().
*/
struct pan_fbd_layer fbd_data = {0};
fbd_data.tiler = tiler_ctx->valhall.desc;
/* internal_layer_index in flags0 is used to select the right
* primitive list in the tiler context, and frame_arg is the value
* that's passed to the fragment shader through r62-r63, which we use
* to pass gl_Layer. Since the layer_idx only takes 8 bits, we might
* use the extra 56 bits we have in frame_argument to pass other
* information to the fragment shader at some point.
*/
assert(layer_idx >= tiler_ctx->valhall.layer_offset);
fbd_data.frame_argument = layer_idx;
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
cfg.pre_frame_0 =
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0],
pan_clean_tile_write_any_set(clean_tile));
cfg.pre_frame_1 =
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1],
pan_clean_tile_write_any_set(clean_tile));
cfg.post_frame = fb->bifrost.pre_post.modes[2];
const unsigned zs_bytes_per_pixel = pan_zsbuf_bytes_per_pixel(fb);
/* We can interleave HSR if we have space for two ZS tiles in
* the tile buffer. */
const unsigned max_zs_tile_size_interleave =
fb->z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel);
const bool hsr_can_interleave =
fb->tile_size <= max_zs_tile_size_interleave;
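/* Worked example (hypothetical numbers): a 32 KiB ZS tile-buffer budget
 * with 8 B/px of ZS data gives 32768 >> 3 = 4096 pixels, so interleaving
 * is possible for tiles of up to 4096 pixels. */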
/* Enabling prepass without interleave is generally not good for
* performance, so disable HSR in that case. */
cfg.hsr_prepass_enable = fb->allow_hsr_prepass && hsr_can_interleave;
cfg.hsr_prepass_interleaving_enable = hsr_can_interleave;
cfg.hsr_prepass_filter_enable = true;
cfg.hsr_hierarchical_optimizations_enable = true;
cfg.internal_layer_index = layer_idx - tiler_ctx->valhall.layer_offset;
}
fbd_data.dcd_pointer = fb->bifrost.pre_post.dcds.gpu;
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
cfg.s_clear = fb->zs.clear_value.stencil;
cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s);
/* Default to 24-bit depth if there's no surface. */
cfg.z_internal_format =
fb->zs.view.zs ? pan_get_z_internal_format(fb->zs.view.zs->format)
: MALI_Z_INTERNAL_FORMAT_D24;
cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z);
if (crc_rt >= 0) {
bool *valid = fb->rts[crc_rt].crc_valid;
bool full = !fb->draw_extent.minx && !fb->draw_extent.miny &&
fb->draw_extent.maxx == (fb->width - 1) &&
fb->draw_extent.maxy == (fb->height - 1);
/* If the CRC was valid it stays valid; if it wasn't, we must
* ensure the render operation covers the full frame and that
* clean tiles are pushed to memory. */
bool new_valid = *valid | (full && pan_clean_tile_write_rt_enabled(
clean_tile, crc_rt));
cfg.crc_read_enable = *valid;
/* If the data is currently invalid, still write CRC
* data if we are doing a full write, so that it is
* valid for next time. */
cfg.crc_write_enable = new_valid;
*valid = new_valid;
}
}
fbd_data.z_clear = util_bitpack_float(fb->zs.clear_value.depth);
{
/* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
uint64_t out_gpu_addr =
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
if (has_zs_crc_ext) {
fbd_data.dbd_pointer = out_gpu_addr;
assert(fbd_data.dbd_pointer % 64 == 0);
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
}
fbd_data.rtd_pointer = out_gpu_addr;
assert(fbd_data.rtd_pointer % 64 == 0);
}
memcpy(out, &fbd_data, sizeof(fbd_data));
out += ALIGN_POT(sizeof(fbd_data), 64);
if (has_zs_crc_ext) {
struct mali_zs_crc_extension_packed *zs_crc_ext = out;
pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, zs_crc_ext, clean_tile);
out += pan_size(ZS_CRC_EXTENSION);
}
const unsigned rt_count = MAX2(fb->rt_count, 1);
unsigned cbuf_offset = 0;
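/* cbuf_offset is each render target's byte offset within the tile
 * buffer, accumulated as tile-buffer bytes-per-pixel times tile size
 * times sample count for every preceding render target. */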
for (unsigned i = 0; i < rt_count; i++) {
pan_emit_rt(fb, layer_idx, i, cbuf_offset, out, clean_tile);
out += pan_size(RENDER_TARGET);
if (!fb->rts[i].view)
continue;
cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) *
fb->tile_size *
pan_image_view_get_nr_samples(fb->rts[i].view);
if (i != crc_rt && fb->rts[i].crc_valid != NULL)
*(fb->rts[i].crc_valid) = false;
}
return 0;
}
#else
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
check_fb_attachments(fb);
@ -1351,6 +1498,7 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
}
return tag.opaque[0];
}
#endif /* PAN_ARCH >= 14 */
#else /* PAN_ARCH == 4 */
static enum mali_color_format
pan_sfbd_raw_format(unsigned bits)
@ -1378,8 +1526,11 @@ GENX(pan_select_tile_size)(struct pan_fb_info *fb)
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx, void *fbd)
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *fbd = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
assert(fb->rt_count <= 1);

View file

@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
static inline unsigned
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props)
const struct pan_kmod_dev_props *props,
unsigned work_reg_count)
{
/* Each shader core can run N tasks and a total of M threads at any single
* time, thus each task should ideally have no more than M/N threads. */
unsigned max_threads_per_task =
props->max_threads_per_core / props->max_tasks_per_core;
ASSERTED unsigned max_threads_per_wg =
pan_compute_max_thread_count(props, work_reg_count);
/* To achieve the best utilization, we should aim for as many workgroups
* per tasks as we can fit without exceeding the above thread limit */
unsigned threads_per_wg =
shader_local_size->x * shader_local_size->y * shader_local_size->z;
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
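/* Worked example (hypothetical numbers): 2048 threads and 16 tasks per
 * core give 128 threads per task; a 32-thread workgroup then yields
 * DIV_ROUND_UP(128, 32) = 4 workgroups per task. */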
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
static inline unsigned
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props,
const struct pan_compute_dim *dim)
const struct pan_compute_dim *dim,
unsigned work_reg_count)
{
/* NOTE: If the instance count is lower than the number of workgroups
* being dispatched, the HW will hold back workgroups until instances
* can be reused. */
unsigned instances;
unsigned wg_per_task =
pan_calc_workgroups_per_task(shader_local_size, props);
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
unsigned max_instances_per_core =
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
@ -341,7 +346,7 @@ void GENX(pan_emit_afrc_color_attachment)(const struct pan_attachment_info *att,
unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx,
void *out);
const struct pan_ptr framebuffer);
#if PAN_ARCH >= 6
unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height,

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2026 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#include "pan_fb.h"
@ -669,9 +670,124 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode,
}
#endif
#if PAN_ARCH >= 14
uint32_t
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
const struct pan_ptr framebuffer)
{
/* Emit the dynamic framebuffer state, i.e. state that may change per layer. */
void *out = framebuffer.cpu;
const struct pan_fb_layout *fb = info->fb;
const struct pan_fb_load *load = info->load;
const struct pan_fb_store *store = info->store;
const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info);
const bool has_zs_crc_ext = pan_fb_has_zs(fb);
struct pan_fbd_layer fbd_data = {0};
fbd_data.tiler = info->tiler_ctx->valhall.desc;
/* layer_index in flags0 is used to select the right primitive list in
* the tiler context, and frame_arg is the value that's passed to the
* fragment shader through r62-r63, which we use to pass gl_Layer. Since
* the layer_idx only takes 8 bits, we might use the extra 56 bits we
* have in frame_argument to pass other information to the fragment
* shader at some point.
*/
assert(info->layer >= info->tiler_ctx->valhall.layer_offset);
fbd_data.frame_argument = info->layer;
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0],
ct.rts || ct.zs || ct.s);
cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1],
ct.rts || ct.zs || ct.s);
cfg.post_frame = info->frame_shaders.modes[2];
/* Enabling prepass without pipelining is generally not good for
* performance, so disable HSR in that case.
*/
cfg.hsr_prepass_enable = info->allow_hsr_prepass &&
pan_fb_can_pipeline_zs(fb);
cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb);
cfg.hsr_prepass_filter_enable = true;
cfg.hsr_hierarchical_optimizations_enable = true;
cfg.internal_layer_index =
info->layer - info->tiler_ctx->valhall.layer_offset;
}
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
if (fb->s_format != PIPE_FORMAT_NONE) {
cfg.s_clear = load && target_has_clear(&load->s) ?
load->s.clear.stencil : 0;
cfg.s_write_enable = store && store->s.store;
}
if (fb->z_format != PIPE_FORMAT_NONE) {
cfg.z_internal_format = pan_get_z_internal_format(fb->z_format);
cfg.z_write_enable = store && store->zs.store;
} else {
cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24;
assert(!store || !store->zs.store);
}
}
fbd_data.z_clear =
util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load &&
target_has_clear(&load->z)
? load->z.clear.depth
: 0);
fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer;
{
/* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
uint64_t out_gpu_addr =
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
if (has_zs_crc_ext) {
fbd_data.dbd_pointer = out_gpu_addr;
assert(fbd_data.dbd_pointer % 64 == 0);
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
}
fbd_data.rtd_pointer = out_gpu_addr;
assert(fbd_data.rtd_pointer % 64 == 0);
}
memcpy(out, &fbd_data, sizeof(fbd_data));
out += ALIGN_POT(sizeof(fbd_data), 64);
if (has_zs_crc_ext) {
struct mali_zs_crc_extension_packed zs_crc;
emit_zs_crc_desc(info, ct, &zs_crc);
memcpy(out, &zs_crc, sizeof(zs_crc));
out += sizeof(zs_crc);
}
uint32_t tile_rt_offset_B = 0;
for (unsigned rt = 0; rt < fb->rt_count; rt++) {
struct mali_rgb_render_target_packed rgb_rt;
emit_rgb_rt_desc(info, ct, rt, tile_rt_offset_B, &rgb_rt);
memcpy(out, &rgb_rt, sizeof(rgb_rt));
out += sizeof(rgb_rt);
if (fb->rt_formats[rt] != PIPE_FORMAT_NONE) {
tile_rt_offset_B += pan_bytes_per_pixel_tib(fb->rt_formats[rt]) *
fb->tile_size_px * fb->sample_count;
}
}
assert(tile_rt_offset_B <= fb->tile_rt_alloc_B);
return 0;
}
#else /* PAN_ARCH < 14 */
uint32_t
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
const struct pan_fb_layout *fb = info->fb;
const struct pan_fb_load *load = info->load;
const struct pan_fb_store *store = info->store;
@ -823,4 +939,5 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
}
return tag.opaque[0];
}
#endif
#endif /* PAN_ARCH >= 14 */
#endif /* PAN_ARCH >= 5 */

View file

@ -1,14 +1,20 @@
/*
* Copyright (C) 2026 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef __PAN_FB_H
#define __PAN_FB_H
#if PAN_ARCH >= 14
#include "genxml/cs_builder.h"
#endif
#include "compiler/shader_enums.h"
#include "genxml/gen_macros.h"
#include "util/format/u_formats.h"
#include "compiler/shader_enums.h"
#include "pan_pool.h"
struct nir_shader;
struct nir_shader_compiler_options;
@ -481,7 +487,7 @@ void GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info,
struct pan_fb_info *fbinfo);
uint32_t GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
void *out);
const struct pan_ptr framebuffer);
#endif
enum ENUM_PACKED pan_fb_shader_op {
@ -620,4 +626,35 @@ GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key,
const struct nir_shader_compiler_options *nir_options);
#endif
#if PAN_ARCH >= 14
/* Framebuffer per-layer state. Keep this structure 64-byte aligned, since
* we want the adjacent ZS_CRC_EXTENSION and RENDER_TARGET descriptors
* aligned. */
struct pan_fbd_layer {
/** GPU address to the tiler descriptor. */
uint64_t tiler;
/** Frame argument. */
uint64_t frame_argument;
/** An instance of Fragment Flags 0. */
struct mali_fragment_flags_0_packed flags0;
/** An instance of Fragment Flags 2. */
struct mali_fragment_flags_2_packed flags2;
/** Z clear value. */
uint32_t z_clear;
/** GPU address to the draw call descriptors. It may be 0. */
uint64_t dcd_pointer;
/** GPU address to the ZS_CRC_EXTENSION descriptor. It may be 0. */
uint64_t dbd_pointer;
/** GPU address to the RENDER_TARGET descriptors. */
uint64_t rtd_pointer;
} __attribute__((aligned(64)));
#endif /* PAN_ARCH >= 14 */
#endif /* __PAN_FB_H */

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2019 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -184,7 +185,27 @@ const struct pan_blendable_format
const struct pan_format GENX(pan_pipe_format)[PIPE_FORMAT_COUNT] = {
FMT(NONE, CONSTANT, 0000, L, VTR_IB),
#if PAN_ARCH >= 7
#if PAN_ARCH >= 14
/* Multiplane formats */
FMT_YUV(R8G8_R8B8_UNORM, Y8U8Y8V8_422, UVYA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(G8R8_B8R8_UNORM, U8Y8V8Y8_422, UYVA, SWAP, CENTER_422, _T____),
FMT_YUV(R8B8_R8G8_UNORM, Y8U8Y8V8_422, VYUA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(B8R8_G8R8_UNORM, U8Y8V8Y8_422, VUYA, SWAP, CENTER_422, _T____),
FMT_YUV(R8_G8B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_B8G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_G8_B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_B8_G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_G8B8_422_UNORM, Y8U8Y8V8_422, YUVA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(R8_B8G8_422_UNORM, U8Y8V8Y8_422, YVUA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(R10_G10B10_420_UNORM, YUYAAYVYAA_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R10_G10B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, YUVA, NO_SWAP, CENTER_422, _T____),
/* Special internal formats */
FMT_YUV(R8G8B8_420_UNORM_PACKED, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R10G10B10_420_UNORM_PACKED, Y10U10V10_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(X6R10X6G10_X6R10X6B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, UVYA, NO_SWAP, CENTER_422, _T____),
#elif PAN_ARCH >= 7
/* Multiplane formats */
FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP, CENTER_422, _T____),

View file

@ -168,6 +168,10 @@ extern const struct pan_blendable_format
pan_blendable_formats_v12[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v13[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v14[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v15[PIPE_FORMAT_COUNT];
uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
@ -184,6 +188,8 @@ pan_blendable_format_table(unsigned arch)
FMT_TABLE(10);
FMT_TABLE(12);
FMT_TABLE(13);
FMT_TABLE(14);
FMT_TABLE(15);
#undef FMT_TABLE
default:
assert(!"Unsupported architecture");
@ -199,6 +205,8 @@ extern const struct pan_format pan_pipe_format_v9[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v15[PIPE_FORMAT_COUNT];
static inline const struct pan_format *
pan_format_table(unsigned arch)
@ -213,6 +221,8 @@ pan_format_table(unsigned arch)
FMT_TABLE(10);
FMT_TABLE(12);
FMT_TABLE(13);
FMT_TABLE(14);
FMT_TABLE(15);
#undef FMT_TABLE
default:
assert(!"Unsupported architecture");

View file

@ -84,6 +84,8 @@ const struct pan_mod_handler *pan_mod_get_handler_v9(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v15(uint64_t modifier);
static inline const struct pan_mod_handler *
pan_mod_get_handler(unsigned arch, uint64_t modifier)
@ -105,6 +107,10 @@ pan_mod_get_handler(unsigned arch, uint64_t modifier)
return pan_mod_get_handler_v12(modifier);
case 13:
return pan_mod_get_handler_v13(modifier);
case 14:
return pan_mod_get_handler_v14(modifier);
case 15:
return pan_mod_get_handler_v15(modifier);
default:
UNREACHABLE("Unsupported arch");
}

View file

@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
}
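/* On v15, max_threads_per_wg may be unpopulated; derive the per-workgroup
 * limit from the per-core register file instead, rounded down to the
 * kernel-reported active-thread granularity. */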
if (pan_arch(props->gpu_id) >= 15) {
assert(props->num_threads_active_granularity);
unsigned max_threads_per_wg =
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
props->num_threads_active_granularity);
return MIN2(max_threads_per_wg, props->max_threads_per_core);
}
assert(props->max_threads_per_wg);
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
props->num_registers_per_core / aligned_reg_count);
}

View file

@ -223,6 +223,25 @@ pan_clump_format(enum pipe_format format)
/* YUV-sampling has special cases */
if (pan_format_is_yuv(format)) {
switch (format) {
#if PAN_ARCH >= 14
case PIPE_FORMAT_R8G8_R8B8_UNORM:
case PIPE_FORMAT_G8R8_B8R8_UNORM:
case PIPE_FORMAT_R8B8_R8G8_UNORM:
case PIPE_FORMAT_B8R8_G8R8_UNORM:
case PIPE_FORMAT_R8_G8B8_422_UNORM:
case PIPE_FORMAT_R8_B8G8_422_UNORM:
case PIPE_FORMAT_R8_G8B8_420_UNORM:
case PIPE_FORMAT_R8_B8G8_420_UNORM:
case PIPE_FORMAT_R8_G8_B8_420_UNORM:
case PIPE_FORMAT_R8_B8_G8_420_UNORM:
case PIPE_FORMAT_R8G8B8_420_UNORM_PACKED:
return MALI_CLUMP_FORMAT_RAW8;
case PIPE_FORMAT_R10_G10B10_420_UNORM:
case PIPE_FORMAT_R10G10B10_420_UNORM_PACKED:
case PIPE_FORMAT_R10_G10B10_422_UNORM:
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
return MALI_CLUMP_FORMAT_R10_PACKED;
#else
case PIPE_FORMAT_R8G8_R8B8_UNORM:
case PIPE_FORMAT_G8R8_B8R8_UNORM:
case PIPE_FORMAT_R8B8_R8G8_UNORM:
@ -242,6 +261,7 @@ pan_clump_format(enum pipe_format format)
case PIPE_FORMAT_R10_G10B10_422_UNORM:
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
return MALI_CLUMP_FORMAT_Y10_UV10_422;
#endif /* PAN_ARCH >= 14 */
default:
UNREACHABLE("unhandled clump format");
}

View file

@ -28,6 +28,10 @@
#include "libpan_v12.h"
#elif (PAN_ARCH == 13)
#include "libpan_v13.h"
#elif (PAN_ARCH == 14)
#include "libpan_v14.h"
#elif (PAN_ARCH == 15)
#include "libpan_v15.h"
#else
#error "Unsupported architecture for libpan"
#endif

View file

@ -26,6 +26,10 @@
#include "libpan_shaders_v12.h"
#elif (PAN_ARCH == 13)
#include "libpan_shaders_v13.h"
#elif (PAN_ARCH == 14)
#include "libpan_shaders_v14.h"
#elif (PAN_ARCH == 15)
#include "libpan_shaders_v15.h"
#else
#error "Unsupported architecture for libpan"
#endif

View file

@ -11,7 +11,7 @@ libpan_shader_files = files(
idep_libpan_per_arch = {}
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpan_spv = custom_target(
input : libpan_shader_files,
output : 'libpan_v' + ver + '.spv',

View file

@ -95,6 +95,14 @@ const struct pan_model pan_model_list[] = {
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(13, 8, 0), 4, "G725", "TKRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 1, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 64)),
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 0, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 64)),
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 4, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
};
/* clang-format on */

View file

@ -31,6 +31,15 @@ struct pan_tiler_features {
#define PAN_VERSION_MINOR(x) (((x) & BITFIELD_RANGE(4, 8)) >> 4)
#define PAN_VERSION_STATUS(x) ((x) & BITFIELD_RANGE(0, 4))
#define PAN_ID64_COMPAT 0xFull
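/* An arch-major value of 0xF in the legacy 32-bit GPU_ID indicates that
 * the 64-bit wide ID encoding below is in use instead. */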
#define PAN_ID64_ARCH_MAJOR(x) (((x) & BITFIELD64_RANGE(56, 8)) >> 56)
#define PAN_ID64_ARCH_MINOR(x) (((x) & BITFIELD64_RANGE(48, 8)) >> 48)
#define PAN_ID64_ARCH_REV(x) (((x) & BITFIELD64_RANGE(40, 8)) >> 40)
#define PAN_ID64_PRODUCT_MAJOR(x) (((x) & BITFIELD64_RANGE(32, 8)) >> 32)
#define PAN_ID64_VERSION_MAJOR(x) (((x) & BITFIELD64_RANGE(16, 8)) >> 16)
#define PAN_ID64_VERSION_MINOR(x) (((x) & BITFIELD64_RANGE(8, 8)) >> 8)
#define PAN_ID64_VERSION_STATUS(x) ((x) & BITFIELD64_RANGE(0, 8))
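/* Wide GPU_ID bit layout: [63:56] arch major, [55:48] arch minor,
 * [47:40] arch rev, [39:32] product major, [23:16] version major,
 * [15:8] version minor, [7:0] version status. */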
/* GPU product id for Midgard */
#define MIDGARD_PROD_ID(x) (((x) & BITFIELD_RANGE(16, 16)) >> 16)
@ -108,8 +117,12 @@ pan_arch(uint64_t gpu_id)
case 0x860:
case 0x880:
return 5;
default:
return PAN_ARCH_MAJOR(gpu_id);
default: {
unsigned gpu_arch = PAN_ARCH_MAJOR(gpu_id);
if (gpu_arch == PAN_ID64_COMPAT)
return PAN_ID64_ARCH_MAJOR(gpu_id);
return gpu_arch;
}
}
}
@ -119,14 +132,21 @@ pan_prod_id(uint64_t gpu_id)
unsigned arch = pan_arch(gpu_id);
if (arch < 6)
return MIDGARD_PROD_ID(gpu_id);
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
PAN_PRODUCT_MAJOR(gpu_id));
else if (arch < PAN_ID64_COMPAT)
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
PAN_PRODUCT_MAJOR(gpu_id));
return PAN_PROD_ID(PAN_ID64_ARCH_MAJOR(gpu_id), PAN_ID64_ARCH_MINOR(gpu_id),
PAN_ID64_PRODUCT_MAJOR(gpu_id));
}
static inline uint32_t
pan_rev(uint64_t gpu_id)
{
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
unsigned arch = pan_arch(gpu_id);
if (arch < PAN_ID64_COMPAT)
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
return PAN_REV(PAN_ID64_VERSION_MAJOR(gpu_id),
PAN_ID64_VERSION_MINOR(gpu_id));
}
#endif

View file

@ -74,7 +74,11 @@ static inline uint32_t
get_fbd_size(bool has_zs_ext, uint32_t rt_count)
{
assert(rt_count >= 1 && rt_count <= MAX_RTS);
#if PAN_ARCH >= 14
uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
uint32_t fbd_size = pan_size(FRAMEBUFFER);
#endif
if (has_zs_ext)
fbd_size += pan_size(ZS_CRC_EXTENSION);
fbd_size += pan_size(RENDER_TARGET) * rt_count;
@ -209,13 +213,27 @@ enum panvk_cs_regs {
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
#endif
#if PAN_ARCH >= 14
/* RUN_FRAGMENT2 staging regs.
* SW ABI:
* - r54:55 contain the pointer to the current FBD layer state.
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment2.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55,
PANVK_CS_REG_FBD_LAYER_PTR = 54,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#else
/* RUN_FRAGMENT staging regs.
* SW ABI:
* - r38:39 contain the pointer to the first tiler descriptor. This is
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#endif
/* RUN_COMPUTE staging regs. */
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
@ -870,4 +888,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages,
void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf,
struct panvk_cs_deps deps);
#if PAN_ARCH >= 14
static inline void
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
{
/* Emit the dynamic fragment state. This state may change per layer. */
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
offsetof(struct pan_fbd_layer, flags0));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
offsetof(struct pan_fbd_layer, flags2));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
offsetof(struct pan_fbd_layer, z_clear));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, tiler));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, rtd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dbd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
offsetof(struct pan_fbd_layer, frame_argument));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dcd_pointer));
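/* Flush so all the loads above have landed before the staging registers
 * are consumed by the fragment run. */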
cs_flush_loads(b);
}
#endif /* PAN_ARCH >= 14 */
#endif /* PANVK_CMD_BUFFER_H */

View file

@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
unsigned wls_total_size = pan_calc_total_wls_size(
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
unsigned wg_per_task = 0;
if (indirect)
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
&phys_dev->kmod.dev->props);
&phys_dev->kmod.dev->props,
cs->info.work_reg_count);
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
compute_state_dirty(cmdbuf, CS)) {
@ -207,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr);
}
if (compute_state_dirty(cmdbuf, CS))
if (compute_state_dirty(cmdbuf, CS)) {
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = cs->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(cs->spd);
}
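/* Reassemble the two packed 32-bit descriptor words into the 64-bit
 * value loaded into the CS register. */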
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD),
panvk_priv_mem_dev_addr(cs->spd));
#endif
}
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd);

View file

@ -51,6 +51,7 @@
#include "vk_render_pass.h"
#include "poly/geometry.h"
#if PAN_ARCH < 14
static enum cs_reg_perm
provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
@ -202,6 +203,7 @@ panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev)
panvk_priv_bo_unref(dev->draw_ctx->fns_bo);
vk_free(&dev->vk.alloc, dev->draw_ctx);
}
#endif /* PAN_ARCH < 14 */
static void
emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf,
@ -1245,8 +1247,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
uint32_t fbds_sz = enabled_layer_count * fbd_sz;
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
cmdbuf->state.gfx.render.fbds =
panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment);
if (!cmdbuf->state.gfx.render.fbds.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@ -1316,14 +1323,23 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
uint32_t new_fbd_flags =
GENX(pan_emit_fb_desc)(&fbd_info, fbds.cpu + fbd_sz * i);
GENX(pan_emit_fb_desc)(&fbd_info, pan_ptr_offset(fbds, fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(i == 0 || new_fbd_flags == fbd_flags);
fbd_flags = new_fbd_flags;
}
#if PAN_ARCH >= 14
/* fbd_flags is unused on v14+. */
assert(!fbd_flags);
#endif
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) {
struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
@ -1335,7 +1351,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
for (uint32_t i = 0; i < enabled_layer_count; i++) {
uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i;
void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz));
fbd_info.layer = layer_idx;
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
@ -1353,8 +1368,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
if (result != VK_SUCCESS)
return result;
ASSERTED uint32_t new_fbd_flags =
GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd);
ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(
&fbd_info, pan_ptr_offset(ir_fbds, fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(new_fbd_flags == fbd_flags);
@ -1367,16 +1382,18 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
/* Wait for IR info push to complete */
cs_wait_slot(b, SB_ID(LS));
bool unset_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
#endif /* PAN_ARCH >= 14 */
if (copy_fbds) {
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
#if PAN_ARCH >= 14
struct cs_index dst_fbd_ptr = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
struct cs_index fbd_idx = cs_reg32(b, 47);
struct cs_index src_fbd_ptr = cs_reg64(b, 48);
struct cs_index remaining_layers_in_td = cs_reg32(b, 50);
#endif
struct cs_index fbd_idx = cs_reg32(b, 60);
struct cs_index src_fbd_ptr = cs_reg64(b, 64);
struct cs_index remaining_layers_in_td = cs_reg32(b, 61);
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
@ -1400,10 +1417,27 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
* framebuffer size is aligned to 64 bytes. */
assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
#if PAN_ARCH >= 14
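/* Copy the per-layer FBD state 64 bytes at a time through scratch
 * registers, patching in the current tiler pointer (the first field of
 * struct pan_fbd_layer) on the first chunk. */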
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
/* Patch the Tiler pointer. */
if (fbd_off == 0)
cs_add64(b, cs_scratch_reg64(b, 0), cur_tiler, 0);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
}
#else
bool unset_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
if (fbd_off == 0) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr,
BITFIELD_MASK(14), fbd_off);
/* Patch the Tiler pointer. */
cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
/* If we don't know what provoking vertex mode the
@ -1423,6 +1457,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
}
#endif
/* Finish stores to pass_dst_fbd_ptr. */
cs_flush_stores(b);
@ -1456,12 +1491,19 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
-(full_td_count * pan_size(TILER_CONTEXT)));
}
} else {
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
cs_move64_to(b, fbd_pointer, fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR),
cmdbuf->state.gfx.render.tiler);
}
#if PAN_ARCH < 14
/* If we don't know what provoking vertex mode the application wants yet,
* leave space to patch it later */
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
@ -1483,6 +1525,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
cs_call(b, addr_reg, length_reg);
}
#endif
}
return VK_SUCCESS;
@ -3299,6 +3342,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
static void
setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
const bool has_zs_ext = pan_fb_has_zs(fb);
@ -3343,6 +3389,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
cs_flush_stores(b);
#endif /* PAN_ARCH >= 14 */
}
static uint32_t
@ -3351,17 +3398,98 @@ pack_32_2x16(uint16_t lo, uint16_t hi)
return (((uint32_t)hi) << 16) | (uint32_t)lo;
}
#if PAN_ARCH >= 14
static void
cs_emit_static_fragment_state(struct cs_builder *b,
struct panvk_cmd_buffer *cmdbuf)
{
/* Emit the static fragment staging registers. These don't change per layer. */
const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render;
const struct pan_fb_layout *fb = &render->fb.layout;
const uint8_t sample_count = render->fb.layout.sample_count;
const struct pan_fb_bbox fb_area_px =
pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px);
const struct pan_fb_bbox bbox_px =
pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px);
assert(pan_fb_bbox_is_valid(fb->tiling_area_px));
struct mali_fragment_bounding_box_packed bbox;
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
cfg.bound_min_x = bbox_px.min_x;
cfg.bound_min_y = bbox_px.min_y;
cfg.bound_max_x = bbox_px.max_x;
cfg.bound_max_y = bbox_px.max_y;
}
struct mali_frame_size_packed frame_size;
pan_pack(&frame_size, FRAME_SIZE, cfg) {
cfg.width = fb->width_px;
cfg.height = fb->height_px;
}
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
bbox.opaque[0] | (uint64_t)bbox.opaque[1] << 32);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
cs_move64_to(
b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
dev->sample_positions->addr.dev +
pan_sample_positions_offset(pan_sample_pattern(sample_count)));
/* Flags 1 */
struct mali_fragment_flags_1_packed flags1;
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
cfg.sample_count = fb->sample_count;
cfg.sample_pattern = pan_sample_pattern(fb->sample_count);
cfg.effective_tile_size = fb->tile_size_px;
cfg.point_sprite_coord_origin_max_y = false;
cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf);
assert(fb->rt_count > 0);
cfg.render_target_count = fb->rt_count;
cfg.color_buffer_allocation = fb->tile_rt_alloc_B;
}
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
/* If we don't know what provoking vertex mode the application wants yet,
* leave space to patch it later */
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
{
/* provoking_vertex flag is bit 14 of Fragment Flags 1. */
cs_add32(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1),
cs_sr_reg32(b, FRAGMENT, FLAGS_1), -(1 << 14));
}
}
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
}
#endif /* PAN_ARCH >= 14 */
static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH < 14
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
#endif
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
/* Now initialize the fragment bits. */
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_update_frag_ctx(b) {
cs_emit_static_fragment_state(b, cmdbuf);
cs_emit_layer_fragment_state(b, fbd_pointer);
}
#else
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
cs_update_frag_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
pack_32_2x16(fb->tiling_area_px.min_x,
@ -3370,6 +3498,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
pack_32_2x16(fb->tiling_area_px.max_x,
fb->tiling_area_px.max_y));
}
#endif
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
@ -3401,6 +3530,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* state for this renderpass, so it's safe to enable. */
struct cs_index addr_reg = cs_scratch_reg64(b, 0);
struct cs_index length_reg = cs_scratch_reg32(b, 2);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
handler_idx * dev->tiler_oom.handler_stride;
@ -3408,6 +3540,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Wait for the tiling to be done before submitting the fragment job. */
wait_finish_tiling(cmdbuf);
@ -3422,8 +3555,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* up. */
cs_move64_to(b, addr_reg, 0);
cs_move32_to(b, length_reg, 0);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Applications tend to forget to describe subpass dependencies, especially
* when it comes to write -> read dependencies on attachments. The
@ -3439,8 +3576,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
}
if (cmdbuf->state.gfx.render.layer_count <= 1) {
#if PAN_ARCH >= 14
cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
} else {
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4);
struct cs_index remaining_layers = cs_scratch_reg32(b, 4);
@ -3449,12 +3591,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) {
cs_add32(b, remaining_layers, remaining_layers, -1);
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(b, fbd_pointer);
cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz);
cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);
}
}
@ -3468,8 +3616,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
struct cs_index completed_top = cs_scratch_reg64(b, 10);
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index tiler_count = cs_reg32(b, 47);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
struct cs_index tiler_count = cs_reg32(b, 60);
struct cs_index oq_chain = cs_scratch_reg64(b, 10);
struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);

View file

@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(shader->spd);
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD),
panvk_priv_mem_dev_addr(shader->spd));
#endif
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd);
@ -155,7 +165,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
* increment/axis parameters requires knowledge of job dimensions, but
* this is somewhat offset by run_compute being a native instruction. */
task_increment = pan_calc_workgroups_per_task(
&shader->cs.local_size, &phys_dev->kmod.dev->props);
&shader->cs.local_size, &phys_dev->kmod.dev->props,
shader->info.work_reg_count);
} else {
panvk_per_arch(calculate_task_axis_and_increment)(
shader, phys_dev, &dim, &task_axis, &task_increment);

View file

@ -13,8 +13,13 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
switch (reg) {
/* The bbox is set up by the fragment subqueue; we should not modify it. */
#if PAN_ARCH >= 14
case 28:
case 29:
#else
case 42:
case 43:
#endif
/* We should only load from the subqueue context. */
case PANVK_CS_REG_SUBQUEUE_CTX_START:
case PANVK_CS_REG_SUBQUEUE_CTX_END:
@ -42,8 +47,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8),
8 * sizeof(uint32_t));
#if PAN_ARCH >= 14
const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
const size_t fbd_size = sizeof(struct mali_framebuffer_packed);
#endif
if (has_zs_ext) {
const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed);
const uint16_t dbd_offset = fbd_size;
/* Copy the whole DBD. */
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other,
@ -57,8 +68,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
}
const uint16_t rts_offset =
sizeof(struct mali_framebuffer_packed) +
(has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
fbd_size + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
for (uint32_t rt = 0; rt < rt_count; rt++) {
const uint16_t rt_offset =
@ -110,12 +120,14 @@ generate_tiler_oom_handler(struct panvk_device *dev,
.tracebuf_addr_offset =
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
};
struct mali_framebuffer_pointer_packed fb_tag;
#if PAN_ARCH < 14
struct mali_framebuffer_pointer_packed fb_tag;
pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
cfg.zs_crc_extension_present = has_zs_ext;
cfg.render_target_count = rt_count;
}
#endif
cs_function_def(&b, &handler, handler_ctx) {
struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@ -140,7 +152,7 @@ generate_tiler_oom_handler(struct panvk_device *dev,
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4);
/* The tiler pointer is pre-filled. */
struct cs_index tiler_ptr = cs_reg64(&b, 38);
struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR);
cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr));
@ -175,12 +187,22 @@ generate_tiler_oom_handler(struct panvk_device *dev,
/* Flush copies before the RUN_FRAGMENT. */
cs_wait_slot(&b, SB_ID(LS));
#if PAN_ARCH >= 14
/* Set FBD pointer to the scratch fbd */
struct cs_index fbd_pointer = cs_reg64(&b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0);
cs_emit_layer_fragment_state(&b, fbd_pointer);
cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
/* Set FBD pointer to the scratch fbd */
cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER),
scratch_fbd_ptr_reg, fb_tag.opaque[0]);
cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
/* Serialize run fragments since we reuse FBD for the runs */
cs_wait_slots(&b, dev->csf.sb.all_iters_mask);

View file

@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue)
tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
alloc_info.size = get_fbd_size(true, MAX_RTS);
alloc_info.alignment = pan_alignment(FRAMEBUFFER);
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
alloc_info.alignment = fbds_alignment;
tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) {
result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,

View file

@ -181,7 +181,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
fbd_info.layer = layer_id;
fbd_info.frame_shaders = fs;
fbd_info.frame_shaders.dcd_pointer += layer_id * 3 * pan_size(DRAW);
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd.cpu);
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd);
result = panvk_cmd_prepare_fragment_job(cmdbuf, tagged_fbd_ptr);
if (result != VK_SUCCESS)

View file

@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
batch->tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
batch->wls_total_size = pan_calc_total_wls_size(
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
}

View file

@ -14,6 +14,7 @@ panvk_entrypoints = custom_target(
'--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
'--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
'--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
'--device-prefix', 'panvk_v14', '--device-prefix', 'panvk_v15',
'--beta', with_vulkan_beta.to_string()
],
depend_files : vk_entrypoints_gen_depend_files,
@ -65,7 +66,7 @@ valhall_archs = [9, 10]
valhall_inc_dir = ['valhall']
valhall_files = []
fifthgen_archs = [12, 13]
fifthgen_archs = [12, 13, 14, 15]
fifthgen_inc_dir = ['fifthgen']
fifthgen_files = []
@ -83,7 +84,7 @@ jm_files = [
'jm/panvk_vX_gpu_queue.c',
]
csf_archs = [10, 12, 13]
csf_archs = [10, 12, 13, 14, 15]
csf_inc_dir = ['csf']
csf_files = [
'csf/panvk_vX_bind_queue.c',
@ -126,7 +127,7 @@ common_per_arch_files = [
sha1_h,
]
foreach arch : [6, 7, 10, 12, 13]
foreach arch : [6, 7, 10, 12, 13, 14, 15]
per_arch_files = common_per_arch_files
inc_panvk_per_arch = []

View file

@ -243,7 +243,7 @@ struct panvk_cmd_graphics_state {
} \
} while (0)
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
struct panvk_device_draw_context {
struct panvk_priv_bo *fns_bo;
uint64_t fn_set_fbds_provoking_vertex_stride;
@ -376,8 +376,7 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
} while (0)
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
VkResult
panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);

View file

@ -61,6 +61,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
case 13: \
panvk_arch_name(name, v13)(__VA_ARGS__); \
break; \
case 14: \
panvk_arch_name(name, v14)(__VA_ARGS__); \
break; \
case 15: \
panvk_arch_name(name, v15)(__VA_ARGS__); \
break; \
default: \
UNREACHABLE("Unsupported architecture"); \
} \
@ -84,6 +90,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
case 13: \
ret = panvk_arch_name(name, v13)(__VA_ARGS__); \
break; \
case 14: \
ret = panvk_arch_name(name, v14)(__VA_ARGS__); \
break; \
case 15: \
ret = panvk_arch_name(name, v15)(__VA_ARGS__); \
break; \
default: \
UNREACHABLE("Unsupported architecture"); \
} \
@ -102,6 +114,10 @@ panvk_catch_indirect_alloc_failure(VkResult error)
#define panvk_per_arch(name) panvk_arch_name(name, v12)
#elif PAN_ARCH == 13
#define panvk_per_arch(name) panvk_arch_name(name, v13)
#elif PAN_ARCH == 14
#define panvk_per_arch(name) panvk_arch_name(name, v14)
#elif PAN_ARCH == 15
#define panvk_per_arch(name) panvk_arch_name(name, v15)
#else
#error "Unsupported arch"
#endif

View file

@ -64,6 +64,8 @@ PER_ARCH_FUNCS(7);
PER_ARCH_FUNCS(10);
PER_ARCH_FUNCS(12);
PER_ARCH_FUNCS(13);
PER_ARCH_FUNCS(14);
PER_ARCH_FUNCS(15);
static VkResult
create_kmod_dev(struct panvk_physical_device *device,
@ -411,6 +413,8 @@ panvk_physical_device_init(struct panvk_physical_device *device,
switch (arch) {
case 6:
case 7:
case 14:
case 15:
if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) {
result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"WARNING: panvk is not well-tested on v%d, "

View file

@ -239,10 +239,15 @@ get_frame_shader(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
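/* v15 takes an explicit register count and preloads through r0-r15,
 * replacing the register_allocation bucket and r48-r63 preload window
 * used on earlier generations. */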
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
cfg.preload.r48_r63 = shader->info.preload >> 48;
#endif
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
}
#endif

View file

@ -550,7 +550,7 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
goto err_free_precomp;
}
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
result = panvk_per_arch(device_draw_context_init)(device);
if (result != VK_SUCCESS)
goto err_free_mem_cache;
@ -616,7 +616,7 @@ err_finish_queues:
panvk_meta_cleanup(device);
err_free_draw_ctx:
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
panvk_per_arch(device_draw_context_cleanup)(device);
err_free_mem_cache:
#endif
@ -679,7 +679,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
}
panvk_precomp_cleanup(device);
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
panvk_per_arch(device_draw_context_cleanup)(device);
#endif
panvk_meta_cleanup(device);

View file

@ -732,6 +732,18 @@ get_conformance_version()
return (VkConformanceVersion){0, 0, 0, 0};
}
static uint32_t
get_device_id(uint64_t gpu_id)
{
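/* On wide-ID GPUs, fold the 64-bit GPU_ID into a stable 32-bit deviceID:
 * the 0xF compat marker in the top nibble, then the arch major/minor and
 * the truncated product/version fields. */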
if (PAN_ARCH >= PAN_ID64_COMPAT)
return ((PAN_ID64_COMPAT << 28) | (PAN_ID64_ARCH_MAJOR(gpu_id) << 20) |
(PAN_ID64_ARCH_MINOR(gpu_id) << 12) |
((PAN_ID64_PRODUCT_MAJOR(gpu_id) & 0xF) << 8) |
((PAN_ID64_VERSION_MAJOR(gpu_id) & 0xF) << 4) |
(PAN_ID64_VERSION_MINOR(gpu_id) & 0xF));
return (gpu_id & 0xFFFFFFFF);
}
void
panvk_per_arch(get_physical_device_properties)(
const struct panvk_instance *instance,
@ -750,8 +762,17 @@ panvk_per_arch(get_physical_device_properties)(
const bool has_disk_cache = device->vk.disk_cache != NULL;
/* Calculate the value from the register count on v15+.
* TODO: deriving the real limit requires register-allocation changes that
* guarantee the workgroup-size limits are respected, so for now clamp the
* value to half of the max thread count (always safe, and matching previous
* GPUs). */
unsigned max_threads_per_wg =
(PAN_ARCH >= 15)
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
device->kmod.dev->props.max_threads_per_core / 2)
: device->kmod.dev->props.max_threads_per_wg;
/* Ensure that the max thread count per workgroup is valid for Bifrost */
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
float pointSizeRangeMin;
float pointSizeRangeMax;
@ -770,7 +791,7 @@ panvk_per_arch(get_physical_device_properties)(
.driverVersion = vk_get_driver_version(),
.vendorID =
instance->force_vk_vendor ? instance->force_vk_vendor : ARM_VENDOR_ID,
.deviceID = device->kmod.dev->props.gpu_id,
.deviceID = get_device_id(device->kmod.dev->props.gpu_id),
.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
/* Vulkan 1.0 limits */
@ -880,11 +901,9 @@ panvk_per_arch(get_physical_device_properties)(
/* We could also split into several jobs, but this has many limitations.
* As such we limit to the max threads per workgroup supported by the GPU.
*/
.maxComputeWorkGroupInvocations =
device->kmod.dev->props.max_threads_per_wg,
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg},
.maxComputeWorkGroupInvocations = max_threads_per_wg,
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
max_threads_per_wg},
/* 8-bit subpixel precision. */
.subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 8,
@ -1075,8 +1094,7 @@ panvk_per_arch(get_physical_device_properties)(
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxComputeWorkgroupSubgroups =
device->kmod.dev->props.max_threads_per_wg /
pan_subgroup_size(PAN_ARCH),
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
.maxPerStageDescriptorInlineUniformBlocks =

View file

@ -1172,10 +1172,15 @@ panvk_shader_upload(struct panvk_device *dev,
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
@ -1191,10 +1196,15 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
@ -1206,11 +1216,16 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
shader->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
#else