v3dv: fix CLE MMU errors avoiding using last bytes of CL BOs.

The last V3D_CLE_READAHEAD bytes of the CLE buffer are unusable because using them would prefetch the next readahead bytes of the CL that would be outside the allocated BO. To guarantee that we can chain a BO to the current CL we always reserve space for the BRANCH or RETURN_FROM_SUB_LIST packets. Not taking this into account has been generating kernel dmesg errors like "MMU error from client CLE". As V3D_CLE_READAHEAD is different from RPi4 (256 bytes) to RPi5 (1024 bytes). So we needed to rename v3dv_cl.c to v3dvX_cl.c to have different objects per V3D_VERSION. Extra assertions have been included to validate that we don't write packets over the usable size of the CL silently. v2: - Do not declare unusable the space needed for the BRANCH packet, but take it into account for all space reservations. v3: - Squash here ("v3dv: Secondary CL needs also to handle CLE readahead") - Remove spureous parenthesis (Iago Toral) - Refactor to avoid checking for needs_return_from_sub_list inside cl_alloc_bo adding unusable_space as new parameter. v4: - Improved logic for chaining BOs moving it to cl_alloc_bo using a new enum v3dv_cl_chain_type to identify the different kinds of BO chaining. Now we increase the size of the BO just before submitting the BRACH/RETURN_FROM_SUB_LIST packages. v5: - Assert on BO size updates that we are within the BO size. (Iago Toral) v6: - Remove changes at cmd_buffer_end_render_pass_secondary as we assumed that cl->bo was already allocated when ending the secondary CL, but it can be NULL. And this was already handle by current code. Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> cc: mesa-stable Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29023>
2026-02-23 11:50:42 +01:00 · 2024-04-30 16:28:50 +02:00 · 2024-04-30 16:28:50 +02:00 · 26c8a5cd72
commit 26c8a5cd72
parent 11dce2ac81
3 changed files with 81 additions and 31 deletions
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@ -34,7 +34,6 @@ v3dv_entrypoints = custom_target(

 libv3dv_files = files(
  'v3dv_bo.c',
-  'v3dv_cl.c',
  'v3dv_cmd_buffer.c',
  'v3dv_debug.c',
  'v3dv_debug.h',
@ -57,6 +56,7 @@ libv3dv_files = files(
 ) + [v3d_xml_pack]

 files_per_version = files(
+  'v3dvx_cl.c',
  'v3dvx_cmd_buffer.c',
  'v3dvx_descriptor_set.c',
  'v3dvx_device.c',
--- a/src/broadcom/vulkan/v3dv_cl.h
+++ b/src/broadcom/vulkan/v3dv_cl.h
@ -182,6 +182,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);
                cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
                cl_advance_and_end(cl, cl_packet_length(packet)); \
                _loop_terminate = NULL;                          \
+                assert(v3dv_cl_offset(cl) <= (cl)->size);        \
        }))                                                      \

 #define cl_emit_with_prepacked(cl, packet, prepacked, name)      \
@ -215,9 +216,10 @@ cl_pack_emit_reloc(struct v3dv_cl *cl, const struct v3dv_cl_reloc *reloc)
                v3dv_job_add_bo(cl->job, reloc->bo);
 }

-#define cl_emit_prepacked_sized(cl, packet, size) do {                \
-        memcpy((cl)->next, packet, size);             \
-        cl_advance(&(cl)->next, size);                \
+#define cl_emit_prepacked_sized(cl, packet, psize) do {          \
+        memcpy((cl)->next, packet, psize);                       \
+        cl_advance(&(cl)->next, psize);                          \
+        assert(v3dv_cl_offset(cl) <= (cl)->size);                \
 } while (0)

 #define cl_emit_prepacked(cl, packet) \
--- a/src/broadcom/vulkan/v3dvx_cl.c
+++ b/src/broadcom/vulkan/v3dvx_cl.c
@ -1,4 +1,4 @@
-/*
+ /*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -22,15 +22,21 @@
 */

 #include "v3dv_private.h"
-
-/* We don't expect that the packets we use in this file change across hw
- * versions, so we just explicitly set the V3D_VERSION and include v3dx_pack
- * here
- */
-#define V3D_VERSION 42
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"

+/* The Control List Executor (CLE) pre-fetches V3D_CLE_READAHEAD bytes from
+ * the Control List buffer. The usage of these last bytes should be avoided or
+ * the CLE would pre-fetch the data after the end of the CL buffer, reporting
+ * the kernel "MMU error from client CLE".
+ */
+#if V3D_VERSION == 42
+#define V3D_CLE_READAHEAD 256
+#endif
+#if V3D_VERSION >= 71
+#define V3D_CLE_READAHEAD 1024
+#endif
+
 void
 v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl)
 {
@ -55,14 +61,40 @@ v3dv_cl_destroy(struct v3dv_cl *cl)
   v3dv_cl_init(NULL, cl);
 }

+enum v3dv_cl_chain_type {
+   V3D_CL_BO_CHAIN_NONE = 0,
+   V3D_CL_BO_CHAIN_WITH_BRANCH,
+   V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST,
+};
+
 static bool
-cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
+cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, enum
+            v3dv_cl_chain_type chain_type)
 {
+   /* The last bytes of a CLE buffer are unusable because of readahead
+    * prefetch, so we need to take it into account when allocating a new BO
+    * for the CL. We also reserve space for the BRANCH/RETURN_FROM_SUB_LIST
+    * packet so we can always emit these last packets to the BO when
+    * needed. We will need to increase cl->size by the packet length before
+    * calling cl_submit to use this reserved space.
+    */
+   uint32_t unusable_space = 0;
+   switch (chain_type) {
+   case V3D_CL_BO_CHAIN_WITH_BRANCH:
+      unusable_space = V3D_CLE_READAHEAD + cl_packet_length(BRANCH);
+      break;
+   case V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST:
+      unusable_space = V3D_CLE_READAHEAD + cl_packet_length(RETURN_FROM_SUB_LIST);
+      break;
+   case V3D_CL_BO_CHAIN_NONE:
+      break;
+   }
+
   /* If we are growing, double the BO allocation size to reduce the number
    * of allocations with large command buffers. This has a very significant
    * impact on the number of draw calls per second reported by vkoverhead.
    */
-   space = align(space, 4096);
+   space = align(space + unusable_space, 4096);
   if (cl->bo)
      space = MAX2(cl->bo->size * 2, space);

@ -83,10 +115,29 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
   }

   /* Chain to the new BO from the old one if requested */
-   if (use_branch && cl->bo) {
-      cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
-      cl_emit(cl, BRANCH, branch) {
-         branch.address = v3dv_cl_address(bo, 0);
+   if (cl->bo) {
+      switch (chain_type) {
+      case V3D_CL_BO_CHAIN_WITH_BRANCH:
+         cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
+         cl->size += cl_packet_length(BRANCH);
+         assert(cl->size + V3D_CLE_READAHEAD <= cl->bo->size);
+         cl_emit(cl, BRANCH, branch) {
+            branch.address = v3dv_cl_address(bo, 0);
+         }
+         break;
+      case V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST:
+         /* We do not want to emit branches from secondary command lists, instead,
+          * we will branch to them when we execute them in a primary using
+          * 'branch to sub list' commands, expecting each linked secondary to
+          * end with a 'return from sub list' command.
+          */
+         cl->size += cl_packet_length(RETURN_FROM_SUB_LIST);
+         assert(cl->size + V3D_CLE_READAHEAD <= cl->bo->size);
+         cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
+         FALLTHROUGH;
+      case V3D_CL_BO_CHAIN_NONE:
+         v3dv_job_add_bo_unchecked(cl->job, bo);
+         break;
      }
   } else {
      v3dv_job_add_bo_unchecked(cl->job, bo);
@ -94,7 +145,11 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)

   cl->bo = bo;
   cl->base = cl->bo->map;
-   cl->size = cl->bo->size;
+   /* Take only into account the usable size of the BO to guarantee that
+    * we never write in the last bytes of the CL buffer because of the
+    * readahead of the CLE
+    */
+   cl->size = cl->bo->size - unusable_space;
   cl->next = cl->base;

   return true;
@ -110,22 +165,14 @@ v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment)
      return offset;
   }

-   cl_alloc_bo(cl, space, false);
+   cl_alloc_bo(cl, space, V3D_CL_BO_CHAIN_NONE);
+
   return 0;
 }

 void
 v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
 {
-   /* We do not want to emit branches from secondary command lists, instead,
-    * we will branch to them when we execute them in a primary using
-    * 'branch to sub list' commands, expecting each linked secondary to
-    * end with a 'return from sub list' command.
-    */
-   bool needs_return_from_sub_list = false;
-   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE && cl->size > 0)
-         needs_return_from_sub_list = true;
-
   /*
    * The CLE processor in the simulator tries to read V3D_CL_MAX_INSTR_SIZE
    * bytes form the CL for each new instruction. If the last instruction in our
@ -139,8 +186,9 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
   if (v3dv_cl_offset(cl) + space <= cl->size)
      return;

-   if (needs_return_from_sub_list)
-      cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
+   enum v3dv_cl_chain_type  chain_type = V3D_CL_BO_CHAIN_WITH_BRANCH;
+   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE)
+      chain_type = V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST;

-   cl_alloc_bo(cl, space, !needs_return_from_sub_list);
+   cl_alloc_bo(cl, space, chain_type);
 }