panvk: Use WB mappings for the global RW and executable memory pools

This means all users of these pools now have to flush the CPU caches
explicitly after writing through the mapping.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36385>
Boris Brezillon 2025-09-17 11:11:40 +02:00
parent 2dd27c647b
commit c0d982751c
10 changed files with 255 additions and 222 deletions
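With write-back (WB) CPU mappings, data written through the host pointer can linger in the CPU caches, so every producer has to flush the written range before the GPU consumes it. The panvk_priv_mem_write*/panvk_priv_mem_write_desc helpers used throughout this diff encapsulate that contract together with panvk_priv_mem_flush(mem, offset, size). Their definitions are not part of this diff; the sketch below only illustrates the assumed pattern, namely a pan_pack()-style scoped macro that exposes the host pointer to the block body and flushes afterwards:

   /* Hypothetical sketch, not the driver's real definition: expose a typed
    * host pointer for the block body, then flush the CPU cache for the
    * written range so the GPU observes the data through the WB mapping. */
   #define priv_mem_write_sketch(mem, offset, type, var)                      \
      for (type *var = (type *)((char *)panvk_priv_mem_host_addr(mem) +       \
                                (offset)),                                     \
                *var##_done = var;                                             \
           var##_done != NULL;                                                 \
           panvk_priv_mem_flush(mem, (offset), sizeof(type)), var##_done = NULL)

   /* Usage then mirrors the call sites below, e.g.:
    *    priv_mem_write_sketch(event->syncobjs, 0, struct panvk_cs_sync32, s) {
    *       s->seqno = 1;
    *    }
    */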


@@ -326,8 +326,7 @@ panvk_meta_desc_copy_rsd(struct panvk_device *dev)
return 0;
}
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
cfg) {
panvk_priv_mem_write_desc(shader->rsd, 0, RENDERER_STATE, cfg) {
pan_shader_prepare_rsd(&shader->info,
panvk_priv_mem_dev_addr(shader->code_mem), &cfg);
}


@@ -34,8 +34,10 @@ panvk_per_arch(CreateEvent)(VkDevice _device,
return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
memset(panvk_priv_mem_host_addr(event->syncobjs), 0,
sizeof(struct panvk_cs_sync32) * PANVK_SUBQUEUE_COUNT);
panvk_priv_mem_write_array(event->syncobjs, 0, struct panvk_cs_sync32,
PANVK_SUBQUEUE_COUNT, sobjs) {
memset(sobjs, 0, sizeof(struct panvk_cs_sync32) * PANVK_SUBQUEUE_COUNT);
}
*pEvent = panvk_event_to_handle(event);
return VK_SUCCESS;
@@ -61,11 +63,12 @@ panvk_per_arch(GetEventStatus)(VkDevice _device, VkEvent _event)
{
VK_FROM_HANDLE(panvk_event, event, _event);
struct panvk_cs_sync32 *syncobjs = panvk_priv_mem_host_addr(event->syncobjs);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!syncobjs[i].seqno)
return VK_EVENT_RESET;
panvk_priv_mem_readback_array(event->syncobjs, 0, struct panvk_cs_sync32,
PANVK_SUBQUEUE_COUNT, syncobjs) {
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
if (!syncobjs[i].seqno)
return VK_EVENT_RESET;
}
}
return VK_EVENT_SET;
@@ -76,10 +79,11 @@ panvk_per_arch(SetEvent)(VkDevice _device, VkEvent _event)
{
VK_FROM_HANDLE(panvk_event, event, _event);
struct panvk_cs_sync32 *syncobjs = panvk_priv_mem_host_addr(event->syncobjs);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
syncobjs[i].seqno = 1;
panvk_priv_mem_write_array(event->syncobjs, 0, struct panvk_cs_sync32,
PANVK_SUBQUEUE_COUNT, syncobjs) {
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
syncobjs[i].seqno = 1;
}
return VK_SUCCESS;
}
@@ -89,8 +93,10 @@ panvk_per_arch(ResetEvent)(VkDevice _device, VkEvent _event)
{
VK_FROM_HANDLE(panvk_event, event, _event);
struct panvk_cs_sync32 *syncobjs = panvk_priv_mem_host_addr(event->syncobjs);
panvk_priv_mem_write_array(event->syncobjs, 0, struct panvk_cs_sync32,
PANVK_SUBQUEUE_COUNT, syncobjs) {
memset(syncobjs, 0, sizeof(*syncobjs) * PANVK_SUBQUEUE_COUNT);
}
memset(syncobjs, 0, sizeof(*syncobjs) * PANVK_SUBQUEUE_COUNT);
return VK_SUCCESS;
}
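The GetEventStatus change above (and the later readback sites in the queue and query-pool code) goes the other way: the CPU reads values the GPU wrote, so the panvk_priv_mem_readback*() helpers presumably pair the host pointer with a CPU cache invalidation on non-IO-coherent systems. A minimal counterpart sketch, where panvk_priv_mem_invalidate() is a hypothetical name not taken from this diff:

   /* Hypothetical sketch: drop stale CPU cache lines for the range the GPU
    * wrote, then hand a typed host pointer to the block body. */
   #define priv_mem_readback_sketch(mem, offset, type, var)                   \
      for (type *var =                                                         \
              (panvk_priv_mem_invalidate(mem, (offset), sizeof(type)),         \
               (type *)((char *)panvk_priv_mem_host_addr(mem) + (offset)));    \
           var != NULL; var = NULL)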


@@ -168,11 +168,11 @@ init_render_desc_ringbuf(struct panvk_gpu_queue *queue)
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create the render desc ringbuf context");
struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);
*syncobj = (struct panvk_cs_sync32){
.seqno = RENDER_DESC_RINGBUF_SIZE,
};
panvk_priv_mem_write(ringbuf->syncobj, 0, struct panvk_cs_sync32, syncobj) {
*syncobj = (struct panvk_cs_sync32){
.seqno = RENDER_DESC_RINGBUF_SIZE,
};
}
return VK_SUCCESS;
}
@@ -350,7 +350,6 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
struct panvk_subqueue *subq = &queue->subqueues[subqueue];
const struct panvk_physical_device *phys_dev =
to_panvk_physical_device(queue->vk.base.device->physical);
struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
VkResult result = init_subqueue_tracing(queue, subqueue);
if (result != VK_SUCCESS)
@@ -401,6 +400,8 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
assert(cs_is_valid(&b));
subq->req_resource.cs_buffer_size = cs_root_chunk_size(&b);
subq->req_resource.cs_buffer_addr = cs_root_chunk_gpu_addr(&b);
panvk_priv_mem_flush(subq->req_resource.buf, 0,
subq->req_resource.cs_buffer_size);
alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
alloc_info.alignment = 64;
@@ -410,25 +411,43 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a queue context");
struct panvk_cs_subqueue_context *cs_ctx =
panvk_priv_mem_host_addr(subq->context);
*cs_ctx = (struct panvk_cs_subqueue_context){
.syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
.debug.tracebuf.cs = subq->tracebuf.addr.dev,
panvk_priv_mem_write(subq->context, 0, struct panvk_cs_subqueue_context,
cs_ctx) {
*cs_ctx = (struct panvk_cs_subqueue_context){
.syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
.debug.tracebuf.cs = subq->tracebuf.addr.dev,
#if PAN_ARCH == 10
/* On the VT/COMPUTE queue, the first iter_sb will be skipped since
* cs_next_iter_sb() is called before the first use, but that's okay,
* because the next slot will be equally free, and the skipped one will
* be re-used at some point.
* On the fragment queue, we increment the iterator when the
* FINISH_FRAGMENT job is issued, which is why we need this value
* to point to a valid+free scoreboard from the start.
*/
.iter_sb = SB_ITER(0),
/* On the VT/COMPUTE queue, the first iter_sb will be skipped since
* cs_next_iter_sb() is called before the first use, but that's okay,
* because the next slot will be equally free, and the skipped one will
* be re-used at some point.
* On the fragment queue, we increment the iterator when the
* FINISH_FRAGMENT job is issued, which is why we need this value
* to point to a valid+free scoreboard from the start.
*/
.iter_sb = SB_ITER(0),
#endif
.reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save),
};
.reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save),
};
if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
cs_ctx->render.tiler_heap =
panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
/* Our geometry buffer comes 4k after the tiler heap, and we encode the
* size in the lower 12 bits so the address can be copied directly
* to the tiler descriptors. */
cs_ctx->render.geom_buf =
(cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
/* Initialize the ringbuf */
cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
.syncobj =
panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
.ptr = queue->render_desc_ringbuf.addr.dev,
.pos = 0,
};
}
}
/* We use the geometry buffer for our temporary CS buffer. */
root_cs = (struct cs_buffer){
@@ -465,24 +484,13 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
/* We do greater than test on sync objects, and given the reference seqno
* registers are all zero at init time, we need to initialize all syncobjs
* with a seqno of one. */
syncobjs[subqueue].seqno = 1;
panvk_priv_mem_write(queue->syncobjs,
subqueue * sizeof(struct panvk_cs_sync64),
struct panvk_cs_sync64, syncobj) {
syncobj->seqno = 1;
}
if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
cs_ctx->render.tiler_heap =
panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
/* Our geometry buffer comes 4k after the tiler heap, and we encode the
* size in the lower 12 bits so the address can be copied directly
* to the tiler descriptors. */
cs_ctx->render.geom_buf =
(cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
/* Initialize the ringbuf */
cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
.syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
.ptr = queue->render_desc_ringbuf.addr.dev,
.pos = 0,
};
struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);
/* Pre-set the heap context on the vertex-tiler/fragment queues. */
@@ -493,6 +501,8 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
assert(cs_is_valid(&b));
panvk_priv_mem_flush(queue->tiler_heap.desc, 4096, cs_root_chunk_size(&b));
struct drm_panthor_sync_op syncop = {
.flags =
DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
@@ -700,8 +710,7 @@ init_tiler(struct panvk_gpu_queue *queue)
tiler_heap->context.handle = thc.handle;
tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;
pan_cast_and_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP,
cfg) {
panvk_priv_mem_write_desc(tiler_heap->desc, 0, TILER_HEAP, cfg) {
cfg.size = tiler_heap->chunk_size;
cfg.base = thc.first_heap_chunk_gpu_va;
cfg.bottom = cfg.base + 64;
@@ -1125,16 +1134,17 @@ panvk_queue_submit_ioctl(struct panvk_queue_submit *submit)
/* If we're tracing, we need to reset the desc ringbufs and the CS
* tracebuf. */
for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
struct panvk_cs_subqueue_context *ctx =
panvk_priv_mem_host_addr(queue->subqueues[i].context);
panvk_priv_mem_rmw(queue->subqueues[i].context, 0,
struct panvk_cs_subqueue_context, ctx) {
if (ctx->render.desc_ringbuf.ptr) {
ctx->render.desc_ringbuf.ptr =
queue->render_desc_ringbuf.addr.dev;
ctx->render.desc_ringbuf.pos = 0;
}
if (ctx->render.desc_ringbuf.ptr) {
ctx->render.desc_ringbuf.ptr = queue->render_desc_ringbuf.addr.dev;
ctx->render.desc_ringbuf.pos = 0;
if (ctx->debug.tracebuf.cs)
ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
}
if (ctx->debug.tracebuf.cs)
ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
}
}
@@ -1235,28 +1245,30 @@ panvk_queue_submit_process_debug(const struct panvk_queue_submit *submit,
}
for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
struct panvk_cs_subqueue_context *ctx =
panvk_priv_mem_host_addr(queue->subqueues[i].context);
panvk_priv_mem_readback(queue->subqueues[i].context, 0,
struct panvk_cs_subqueue_context, ctx) {
size_t trace_size = trace_size =
ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
size_t trace_size =
ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
if (!trace_size)
continue;
if (trace_size) {
assert(
trace_size <= queue->subqueues[i].tracebuf.size ||
!"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
assert(
trace_size <= queue->subqueues[i].tracebuf.size ||
!"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
assert(
!ctx->render.desc_ringbuf.ptr ||
ctx->render.desc_ringbuf.pos <=
queue->render_desc_ringbuf.size ||
!"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
assert(
!ctx->render.desc_ringbuf.ptr ||
ctx->render.desc_ringbuf.pos <= queue->render_desc_ringbuf.size ||
!"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
uint64_t trace = queue->subqueues[i].tracebuf.addr.dev;
uint64_t trace = queue->subqueues[i].tracebuf.addr.dev;
pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n", i);
pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_id);
pandecode_user_msg(decode_ctx, "\n");
pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n",
i);
pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_id);
pandecode_user_msg(decode_ctx, "\n");
}
}
}
}
@@ -1407,10 +1419,11 @@ panvk_per_arch(gpu_queue_check_status)(struct vk_queue *vk_queue)
/* check for CS error and treat it as device lost */
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
const struct panvk_cs_subqueue_context *subq_ctx =
panvk_priv_mem_host_addr(queue->subqueues[i].context);
if (subq_ctx->last_error != 0)
return vk_queue_set_lost(&queue->vk, "CS_FAULT");
panvk_priv_mem_readback(queue->subqueues[i].context, 0,
struct panvk_cs_subqueue_context, subq_ctx) {
if (subq_ctx->last_error != 0)
return vk_queue_set_lost(&queue->vk, "CS_FAULT");
}
}
int ret = pan_kmod_ioctl(dev->drm_fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
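A side note on the subqueue-context initialization above: the geometry buffer address and size are packed into a single 64-bit word because the buffer is 4 KiB aligned, which leaves the low 12 bits free to carry the size in 4 KiB units. A small worked example with a made-up base address:

   /* Values from the diff: the geometry buffer sits 4 KiB after the tiler
    * heap descriptor and is 64 KiB large, i.e. 16 pages -> low bits 0x10. */
   uint64_t tiler_heap = 0x8000000000ull;                /* hypothetical */
   uint64_t geom_buf   = (tiler_heap + 4096) | ((64 * 1024) >> 12);
   /* geom_buf == 0x8000001010: address 0x8000001000, size 16 * 4 KiB. */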


@@ -47,38 +47,31 @@ struct panvk_query_pool {
VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_query_pool, vk.base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
static uint64_t
static inline uint32_t
panvk_query_available_offset(struct panvk_query_pool *pool, uint32_t query)
{
assert(query < pool->vk.query_count);
return query * sizeof(struct panvk_query_available_obj);
}
static inline uint64_t
panvk_query_available_dev_addr(struct panvk_query_pool *pool, uint32_t query)
{
assert(query < pool->vk.query_count);
return panvk_priv_mem_dev_addr(pool->available_mem) + query * sizeof(struct panvk_query_available_obj);
return panvk_priv_mem_dev_addr(pool->available_mem) +
panvk_query_available_offset(pool, query);
}
static struct panvk_query_available_obj *
panvk_query_available_host_addr(struct panvk_query_pool *pool, uint32_t query)
{
assert(query < pool->vk.query_count);
return (struct panvk_query_available_obj *)panvk_priv_mem_host_addr(pool->available_mem) + query;
}
static uint64_t
static inline uint64_t
panvk_query_offset(struct panvk_query_pool *pool, uint32_t query)
{
assert(query < pool->vk.query_count);
return query * (uint64_t)pool->query_stride;
}
static uint64_t
static inline uint64_t
panvk_query_report_dev_addr(struct panvk_query_pool *pool, uint32_t query)
{
return panvk_priv_mem_dev_addr(pool->mem) + panvk_query_offset(pool, query);
}
static struct panvk_query_report *
panvk_query_report_host_addr(struct panvk_query_pool *pool, uint32_t query)
{
return (void *)((char *)panvk_priv_mem_host_addr(pool->mem) +
panvk_query_offset(pool, query));
}
#endif


@@ -70,11 +70,14 @@ panvk_per_arch(CreateBufferView)(VkDevice _device,
if (!panvk_priv_mem_check_alloc(view->mem))
return panvk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
struct pan_ptr ptr = {
.gpu = panvk_priv_mem_dev_addr(view->mem),
.cpu = panvk_priv_mem_host_addr(view->mem),
};
GENX(pan_buffer_texture_emit)(&bview, &view->descs.tex, &ptr);
panvk_priv_mem_write(view->mem, 0, struct mali_surface_with_stride_packed, sd) {
struct pan_ptr ptr = {
.gpu = panvk_priv_mem_dev_addr(view->mem),
.cpu = sd,
};
GENX(pan_buffer_texture_emit)(&bview, &view->descs.tex, &ptr);
}
#endif
}


@@ -173,8 +173,7 @@ get_preload_shader(struct panvk_device *dev,
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM,
cfg) {
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;


@@ -70,7 +70,8 @@ static void
panvk_device_init_mempools(struct panvk_device *dev)
{
struct panvk_pool_properties rw_pool_props = {
.create_flags = 0,
.create_flags =
panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_WB_MMAP),
.slab_size = 16 * 1024,
.label = "Device RW cached memory pool",
.owns_bos = false,
@@ -93,7 +94,8 @@ panvk_device_init_mempools(struct panvk_device *dev)
panvk_pool_init(&dev->mempools.rw_nc, dev, NULL, NULL, &rw_nc_pool_props);
struct panvk_pool_properties exec_pool_props = {
.create_flags = PAN_KMOD_BO_FLAG_EXECUTABLE,
.create_flags = panvk_device_adjust_bo_flags(
dev, PAN_KMOD_BO_FLAG_EXECUTABLE | PAN_KMOD_BO_FLAG_WB_MMAP),
.slab_size = 16 * 1024,
.label = "Device executable memory pool (shaders)",
.owns_bos = false,
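With the executable pool now also mapped write-back, shader uploads follow the same write-then-flush contract. A rough caller-side sketch; the mempools.exec field name, the alloc-info type name, and bin/bin_size (the shader binary) are assumptions for illustration, while panvk_pool_alloc_mem(), panvk_priv_mem_host_addr() and panvk_priv_mem_flush() appear elsewhere in this diff:

   /* Sketch only: write the shader binary through the cached host mapping
    * of the executable pool, then flush the range before the GPU fetches it. */
   struct panvk_pool_alloc_info info = {     /* type name assumed */
      .size = bin_size,
      .alignment = 128,
   };
   struct panvk_priv_mem code =
      panvk_pool_alloc_mem(&dev->mempools.exec, info);  /* field name assumed */
   memcpy(panvk_priv_mem_host_addr(code), bin, bin_size);
   panvk_priv_mem_flush(code, 0, bin_size);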


@@ -153,69 +153,73 @@ prepare_tex_descs(struct panvk_image_view *view)
if (!panvk_priv_mem_check_alloc(view->mem))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
struct pan_ptr ptr = {
.gpu = panvk_priv_mem_dev_addr(view->mem),
.cpu = panvk_priv_mem_host_addr(view->mem),
};
panvk_priv_mem_write_array(view->mem, 0, uint8_t, alloc_info.size, cpu_ptr) {
struct pan_ptr ptr = {
.gpu = panvk_priv_mem_dev_addr(view->mem),
.cpu = cpu_ptr,
};
#if PAN_ARCH >= 9
struct pan_ptr storage_ptr = ptr;
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
uint32_t storage_payload_offset = alloc_info.size - storage_payload_size;
storage_ptr.gpu += storage_payload_offset;
storage_ptr.cpu += storage_payload_offset;
}
struct pan_ptr storage_ptr = ptr;
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
uint32_t storage_payload_offset =
alloc_info.size - storage_payload_size;
storage_ptr.gpu += storage_payload_offset;
storage_ptr.cpu += storage_payload_offset;
}
#endif
if (plane_count > 1) {
memset(pview.planes, 0, sizeof(pview.planes));
if (plane_count > 1) {
memset(pview.planes, 0, sizeof(pview.planes));
for (uint32_t plane = 0; plane < plane_count; plane++) {
VkFormat plane_format =
vk_format_get_plane_format(view->vk.view_format, plane);
for (uint32_t plane = 0; plane < plane_count; plane++) {
VkFormat plane_format =
vk_format_get_plane_format(view->vk.view_format, plane);
/* We need a per-plane pview. */
pview.planes[0] = view->pview.planes[plane];
pview.format = vk_format_to_pipe_format(plane_format);
/* We need a per-plane pview. */
pview.planes[0] = view->pview.planes[plane];
pview.format = vk_format_to_pipe_format(plane_format);
GENX(pan_sampled_texture_emit)(&pview, &view->descs.tex[plane], &ptr);
GENX(pan_sampled_texture_emit)(&pview, &view->descs.tex[plane],
&ptr);
#if PAN_ARCH >= 9
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
GENX(pan_storage_texture_emit)(
&pview, &view->descs.storage_tex[plane], &storage_ptr);
storage_ptr.cpu += tex_payload_size;
storage_ptr.gpu += tex_payload_size;
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
GENX(pan_storage_texture_emit)(
&pview, &view->descs.storage_tex[plane], &storage_ptr);
storage_ptr.cpu += tex_payload_size;
storage_ptr.gpu += tex_payload_size;
}
#endif
ptr.cpu += tex_payload_size;
ptr.gpu += tex_payload_size;
}
} else {
GENX(pan_sampled_texture_emit)(&pview, &view->descs.tex[0], &ptr);
#if PAN_ARCH >= 9
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)
GENX(pan_storage_texture_emit)(&pview, &view->descs.storage_tex[0],
&storage_ptr);
#endif
}
if (can_preload_other_aspect) {
/* If the depth was present in the aspects mask, we've handled it
* already, so move on to the stencil. If it wasn't present, it's the
* stencil texture we create first, and we need to handle the depth here.
*/
pview.format = (view->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
? panvk_image_stencil_only_pfmt(image)
: panvk_image_depth_only_pfmt(image);
ptr.cpu += tex_payload_size;
ptr.gpu += tex_payload_size;
GENX(pan_sampled_texture_emit)(&pview,
&view->descs.zs.other_aspect_tex, &ptr);
}
} else {
GENX(pan_sampled_texture_emit)(&pview, &view->descs.tex[0], &ptr);
#if PAN_ARCH >= 9
if (view->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)
GENX(pan_storage_texture_emit)(&pview, &view->descs.storage_tex[0],
&storage_ptr);
#endif
}
if (!can_preload_other_aspect)
return VK_SUCCESS;
/* If the depth was present in the aspects mask, we've handled it already, so
* move on to the stencil. If it wasn't present, it's the stencil texture we
* create first, and we need to handle the depth here.
*/
pview.format = (view->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
? panvk_image_stencil_only_pfmt(image)
: panvk_image_depth_only_pfmt(image);
ptr.cpu += tex_payload_size;
ptr.gpu += tex_payload_size;
GENX(pan_sampled_texture_emit)(&pview, &view->descs.zs.other_aspect_tex,
&ptr);
return VK_SUCCESS;
}


@@ -24,9 +24,11 @@ static void
reset_query_pool(struct panvk_query_pool *pool, uint32_t firstQuery,
uint32_t queryCount)
{
struct panvk_query_available_obj *available =
panvk_query_available_host_addr(pool, firstQuery);
memset(available, 0, queryCount * sizeof(*available));
panvk_priv_mem_write_array(pool->available_mem,
panvk_query_available_offset(pool, firstQuery),
struct panvk_query_available_obj, queryCount,
available)
memset(available, 0, queryCount * sizeof(*available));
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -131,14 +133,19 @@ panvk_per_arch(ResetQueryPool)(VkDevice device, VkQueryPool queryPool,
static bool
panvk_query_is_available(struct panvk_query_pool *pool, uint32_t query)
{
struct panvk_query_available_obj *available =
panvk_query_available_host_addr(pool, query);
bool res = false;
panvk_priv_mem_readback(pool->available_mem,
panvk_query_available_offset(pool, query),
struct panvk_query_available_obj, available) {
#if PAN_ARCH >= 10
return p_atomic_read(&available->sync_obj.seqno) != 0;
res = p_atomic_read(&available->sync_obj.seqno) != 0;
#else
return p_atomic_read(&available->value) != 0;
res = p_atomic_read(&available->value) != 0;
#endif
}
return res;
}
static VkResult
@@ -248,28 +255,29 @@ panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool,
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
const struct panvk_query_report *src =
panvk_query_report_host_addr(pool, query);
assert(i * stride < dataSize);
void *dst = (char *)pData + i * stride;
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
if (write_results)
cpu_write_occlusion_query_result(dst, 0, flags, src,
pool->reports_per_query);
break;
}
panvk_priv_mem_readback(pool->mem, panvk_query_offset(pool, query),
struct panvk_query_report, src) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION: {
if (write_results)
cpu_write_occlusion_query_result(dst, 0, flags, src,
pool->reports_per_query);
break;
}
#if PAN_ARCH >= 10
case VK_QUERY_TYPE_TIMESTAMP: {
if (write_results)
cpu_write_timestamp_query_result(dst, 0, flags, src,
pool->reports_per_query);
break;
}
case VK_QUERY_TYPE_TIMESTAMP: {
if (write_results)
cpu_write_timestamp_query_result(dst, 0, flags, src,
pool->reports_per_query);
break;
}
#endif
default:
UNREACHABLE("Unsupported query type");
default:
UNREACHABLE("Unsupported query type");
}
}
if (!write_results)


@@ -1093,8 +1093,7 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->rsd))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
cfg) {
panvk_priv_mem_write_desc(shader->rsd, 0, RENDERER_STATE, cfg) {
pan_shader_prepare_rsd(&shader->info,
panvk_shader_variant_get_dev_addr(shader), &cfg);
}
@@ -1104,8 +1103,7 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spd))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM,
cfg) {
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&shader->info);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
@@ -1131,8 +1129,8 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spds.all_points))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.all_points),
SHADER_PROGRAM, cfg) {
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
@@ -1146,8 +1144,8 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spds.all_triangles))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.all_triangles),
SHADER_PROGRAM, cfg) {
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
@@ -1162,8 +1160,8 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spds.pos_points))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
SHADER_PROGRAM, cfg) {
panvk_priv_mem_write_desc(shader->spds.pos_points, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
cfg.register_allocation =
@@ -1178,8 +1176,8 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spds.pos_triangles))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
SHADER_PROGRAM, cfg) {
panvk_priv_mem_write_desc(shader->spds.pos_triangles, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
cfg.register_allocation =
@@ -1196,8 +1194,7 @@ panvk_shader_upload(struct panvk_device *dev,
if (!panvk_priv_mem_check_alloc(shader->spds.var))
return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.var),
SHADER_PROGRAM, cfg) {
panvk_priv_mem_write_desc(shader->spds.var, 0, SHADER_PROGRAM, cfg) {
unsigned work_count = shader->info.vs.secondary_work_reg_count;
cfg.stage = pan_shader_stage(&shader->info);
@@ -1583,13 +1580,13 @@ shader_desc_info_deserialize(struct panvk_device *dev,
};
shader->desc_info.others.map =
panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
uint32_t *copy_table =
panvk_priv_mem_host_addr(shader->desc_info.others.map);
if (!copy_table)
if (!panvk_priv_mem_check_alloc(shader->desc_info.others.map))
return panvk_error(shader, VK_ERROR_OUT_OF_DEVICE_MEMORY);
blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
panvk_priv_mem_write_array(shader->desc_info.others.map, 0, uint32_t,
others_count, copy_table) {
blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
}
}
#else
shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
@@ -1738,6 +1735,8 @@ shader_desc_info_serialize(struct blob *blob,
others_count += shader->desc_info.others.count[i];
}
/* No need to wrap this one in panvk_priv_mem_readback(), because the
* GPU is not supposed to touch it. */
blob_write_bytes(blob,
panvk_priv_mem_host_addr(shader->desc_info.others.map),
sizeof(uint32_t) * others_count);
@@ -2080,33 +2079,40 @@ emit_varying_attrs(struct panvk_pool *desc_pool,
unsigned varying_count, const struct varyings_info *info,
unsigned *buf_offsets, struct panvk_priv_mem *mem)
{
*mem = panvk_pool_alloc_desc_array(desc_pool, varying_count, ATTRIBUTE);
if (!varying_count) {
*mem = (struct panvk_priv_mem){0};
return VK_SUCCESS;
}
if (varying_count && !panvk_priv_mem_check_alloc(*mem))
*mem = panvk_pool_alloc_desc_array(desc_pool, varying_count, ATTRIBUTE);
if (!panvk_priv_mem_check_alloc(*mem))
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
struct mali_attribute_packed *attrs = panvk_priv_mem_host_addr(*mem);
unsigned attr_idx = 0;
panvk_priv_mem_write_array(*mem, 0, struct mali_attribute_packed,
varying_count, attrs) {
unsigned attr_idx = 0;
for (unsigned i = 0; i < varying_count; i++) {
pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
gl_varying_slot loc = varyings[i].location;
enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
? info->fmts[loc]
: PIPE_FORMAT_NONE;
for (unsigned i = 0; i < varying_count; i++) {
pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
gl_varying_slot loc = varyings[i].location;
enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
? info->fmts[loc]
: PIPE_FORMAT_NONE;
if (pfmt == PIPE_FORMAT_NONE) {
if (pfmt == PIPE_FORMAT_NONE) {
#if PAN_ARCH >= 7
cfg.format = (MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
cfg.format =
(MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
#else
cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
#endif
} else {
cfg.buffer_index = varying_buf_id(loc);
cfg.offset = buf_offsets[loc];
cfg.format = varying_format(loc, info->fmts[loc]);
} else {
cfg.buffer_index = varying_buf_id(loc);
cfg.offset = buf_offsets[loc];
cfg.format = varying_format(loc, info->fmts[loc]);
}
cfg.offset_enable = false;
}
cfg.offset_enable = false;
}
}