winsys/amdgpu: fwm packet pre-emption for gfx 11.5

gfx 11.5 uses f32 firmware. f32 firmware requires a COND_EXEC
packet to flush the ring buffer when pre-emption occurs.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36700>
Author: Yogesh Mohan Marimuthu, 2025-07-18 11:01:55 +05:30, committed by Marge Bot
parent 37c7d19e46
commit 9beb668d8d
4 changed files with 76 additions and 10 deletions
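For orientation before the per-file hunks, here is a condensed sketch of the mechanism this commit adds. Only the helper names (amdgpu_pkt_begin, amdgpu_pkt_add_dw, amdgpu_pkt_get_ptr_skip_dw, amdgpu_pkt_get_next_wptr) and the defines (PKT3_COND_EXEC, COND_EXEC_USERQ_OVERRULE_CMD) are taken from the patch; the surrounding flow is simplified and is not the literal code:

uint32_t *count_dw_ptr;
uint64_t start_wptr;

amdgpu_pkt_begin();

/* 1) Emit a COND_EXEC packet with a zeroed payload; the exec count is not known yet. */
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);

/* 2) Reserve the exec-count dword in the ring and remember where the packet ends. */
count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
start_wptr = amdgpu_pkt_get_next_wptr();

/* ... emit FENCE_WAIT_MULTI waits, the IBs, fence updates and PROTECTED_FENCE_SIGNAL ... */

/* 3) Patch the skip count: the number of dwords between the COND_EXEC packet and the end of
 * the job, plus the user-queue overrule flag. When the f32 firmware pre-empts the queue, it
 * uses this count to flush (skip) the remaining packets of the job. */
*count_dw_ptr = (amdgpu_pkt_get_next_wptr() - start_wptr) | COND_EXEC_USERQ_OVERRULE_CMD;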


@@ -70,6 +70,7 @@
#define PREDICATION_OP_BOOL32 0x4
#define PREDICATION_CONTINUE (1 << 31)
#define PKT3_COND_EXEC 0x22
#define COND_EXEC_USERQ_OVERRULE_CMD (1 << 31)
#define PKT3_PRED_EXEC 0x23
#define PKT3_DRAW_INDIRECT 0x24
#define PKT3_DRAW_INDEX_INDIRECT 0x25
@@ -111,6 +112,9 @@
#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8)
#define PKT3_DRAW_INDEX_OFFSET_2 0x35
#define PKT3_WRITE_DATA 0x37
#define WRITE_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8)
#define WRITE_DATA_WR_CONFIRM (1 << 20)
#define WRITE_DATA_CACHE_POLICY(x) (x << 25)
#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A /* GFX6 only */
@@ -250,6 +254,8 @@
#define PKT3_INCREMENT_CE_COUNTER 0x84
#define PKT3_INCREMENT_DE_COUNTER 0x85
#define PKT3_WAIT_ON_CE_COUNTER 0x86
#define PKT3_FRAME_CONTROL 0x90
#define S_FRAME_CONTROL_CMD(x) ((x) << 28)
#define PKT3_HDP_FLUSH 0x95
#define PKT3_SET_SH_REG_INDEX 0x9B
#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* GFX8+ */
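This hunk only adds defines. As a usage illustration, here is a hedged sketch of how the new WRITE_DATA field helpers might be combined to write one dword to GPU memory; the packet layout and the dst-sel value (5 = memory) follow common PM4 usage rather than anything shown in this commit, and va/value are hypothetical variables:

amdgpu_pkt_add_dw(PKT3(PKT3_WRITE_DATA, 3, 0));                    /* count = payload dwords - 1 */
amdgpu_pkt_add_dw(WRITE_DATA_DST_SEL(5) | WRITE_DATA_WR_CONFIRM);  /* dst-sel 5 = memory (assumed) */
amdgpu_pkt_add_dw(va);         /* destination GPU address, low 32 bits */
amdgpu_pkt_add_dw(va >> 32);   /* destination GPU address, high 32 bits */
amdgpu_pkt_add_dw(value);      /* the dword to write */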


@@ -1408,6 +1408,11 @@ static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
return r;
}
struct cond_exec_skip_count {
uint32_t *count_dw_ptr;
uint64_t start_wptr;
};
static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
struct amdgpu_userq *userq,
struct amdgpu_cs_context *csc,
@@ -1417,13 +1422,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_begin();
if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
struct cond_exec_skip_count *cond_exec_skip_counts = NULL;
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
/* Index 0 holds the skip count for skipping the entire job. The remaining entries hold
* the skip counts used to jump to the end of the job when pre-emption happens during a
* FENCE_WAIT_MULTI packet.
*/
cond_exec_skip_counts = (struct cond_exec_skip_count*)alloca(
sizeof(struct cond_exec_skip_count) * (1 + DIV_ROUND_UP(num_fences, 4)));
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
cond_exec_skip_counts[0].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
cond_exec_skip_counts[0].start_wptr = amdgpu_pkt_get_next_wptr();
}
if (num_fences) {
unsigned max_num_fences_fwm;
unsigned num_fences_in_iter;
if (csc->aws->info.has_dedicated_vram || csc->aws->info.gfx_level >= GFX12)
max_num_fences_fwm = 32;
else
max_num_fences_fwm = 4;
for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) {
num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ?
num_fences - i : max_num_fences_fwm;
@@ -1435,6 +1458,15 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_add_dw(fence_info[i + j].value);
amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
}
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
cond_exec_skip_counts[1 + i / max_num_fences_fwm].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
cond_exec_skip_counts[1 + i / max_num_fences_fwm].start_wptr = amdgpu_pkt_get_next_wptr();
}
}
}
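To make the batching and skip-count bookkeeping above concrete, a short worked example using only what the hunks already show:

/* Worked example (GFX11.5 APU, num_fences = 10, so max_num_fences_fwm = 4):
 *   fence loop iterations: i = 0, 4, 8 -> three FENCE_WAIT_MULTI packets of 4, 4 and 2 fences
 *   cond_exec_skip_counts: 1 + DIV_ROUND_UP(10, 4) = 4 entries; entry 0 skips the entire job,
 *   the remaining entries skip from their FENCE_WAIT_MULTI packet to the end of the job.
 *   Every entry is later patched to (end wptr - start wptr) | COND_EXEC_USERQ_OVERRULE_CMD. */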
@@ -1469,11 +1501,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
amdgpu_pkt_add_dw(0);
/* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer
* is only accessible from kernel through VMID 0.
/* protected signal packet. This is a trusted RELEASE_MEM packet.
*
* The kernel allocates the memory for the protected fence and passes the protected fence
* address in the MQD (memory queue descriptor, where the static and dynamic queue states
* are stored). This fence memory is mapped as writable only for VMID 0. When the job
* completes, this packet writes the ring buffer's monotonic (non-wrapping) read pointer
* value to the fence address passed in the MQD.
*
* The protected fence memory is mapped read-only for the user VMID. The
* DRM_AMDGPU_USERQ_WAIT ioctl returns the read-only fence memory address along with the
* protected fence sequence number to wait on, both of which are used in the
* FENCE_WAIT_MULTI packet.
*
* The PKT3_PROTECTED_FENCE_SIGNAL packet should be the last packet before ringing the
* doorbell so that the mesa user fence sequence number matches the protected fence
* sequence number, which is helpful when debugging.
*/
amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
amdgpu_pkt_add_dw(0);
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
for (unsigned i = 0; i < 1 + DIV_ROUND_UP(num_fences, 4); i++)
*cond_exec_skip_counts[i].count_dw_ptr = (amdgpu_pkt_get_next_wptr() -
cond_exec_skip_counts[i].start_wptr) |
COND_EXEC_USERQ_OVERRULE_CMD;
}
} else {
mesa_loge("amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
}
@@ -1496,7 +1548,7 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
/* Syncobj dependencies. */
unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num;
uint32_t *syncobj_dependencies_list =
(uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
/* Currently only 1 vm timeline syncobj can be a dependency. */


@@ -58,14 +58,16 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
*userq->wptr_bo_map = 0;
userq->next_wptr = 0;
userq->rptr_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
/* Allocate memory for rptr. */
userq->vram_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
RADEON_FLAG_CLEAR_VRAM | RADEON_FLAG_GL2_BYPASS |
RADEON_FLAG_NO_SUBALLOC |
RADEON_FLAG_NO_INTERPROCESS_SHARING);
if (!userq->rptr_bo)
if (!userq->vram_bo)
return false;
update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->rptr_bo);
update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->vram_bo);
userq->rptr_va = amdgpu_bo_get_va(userq->vram_bo);
return true;
}
@@ -77,7 +79,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq)
radeon_bo_reference(&aws->dummy_sws.base, &userq->gtt_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->wptr_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->rptr_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->vram_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->doorbell_bo, NULL);
switch (userq->ip_type) {
@@ -206,8 +208,8 @@ amdgpu_userq_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, enum am
r = ac_drm_create_userqueue(aws->dev, hw_ip_type,
get_real_bo(amdgpu_winsys_bo(userq->doorbell_bo))->kms_handle,
AMDGPU_USERQ_DOORBELL_INDEX, ring_va, AMDGPU_USERQ_RING_SIZE,
amdgpu_bo_get_va(userq->wptr_bo), amdgpu_bo_get_va(userq->rptr_bo),
mqd, priority, &userq->userq_handle);
amdgpu_bo_get_va(userq->wptr_bo), userq->rptr_va, mqd, priority,
&userq->userq_handle);
if (r == -EACCES && priority == AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) {
/* Try again with a lower priority. */
priority = AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH;


@@ -32,6 +32,11 @@ extern "C" {
userq->next_wptr = __next_wptr; \
} while (0)
#define amdgpu_pkt_get_ptr_skip_dw() \
(__ring_ptr + (__next_wptr++ & AMDGPU_USERQ_RING_SIZE_DW_MASK))
#define amdgpu_pkt_get_next_wptr() __next_wptr
struct amdgpu_winsys;
struct amdgpu_screen_winsys;
@@ -68,7 +73,8 @@ struct amdgpu_userq {
* (this avoids writing multiple times to the door bell for the same
* submission) */
uint64_t next_wptr;
struct pb_buffer_lean *rptr_bo;
struct pb_buffer_lean *vram_bo;
uint64_t rptr_va;
struct pb_buffer_lean *doorbell_bo;
uint64_t *doorbell_bo_map;
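The two helpers added to this header are what enable the deferred patching used in amdgpu_cs_add_userq_packets(). A brief usage sketch, assuming the __ring_ptr/__next_wptr environment that the existing amdgpu_pkt_* macros in this header establish; the variable names are illustrative:

/* Reserve one dword in the ring without writing it yet. __next_wptr increases monotonically,
 * so the ring offset is (__next_wptr & AMDGPU_USERQ_RING_SIZE_DW_MASK); the returned CPU
 * pointer can be written later, as long as the ring has not wrapped over that slot. */
uint32_t *patch_ptr = amdgpu_pkt_get_ptr_skip_dw();
uint64_t start = amdgpu_pkt_get_next_wptr();   /* wptr just past the reserved dword */

/* ... emit more packets ... */

/* Fill in the reserved dword once the distance to the end of the job is known. */
*patch_ptr = (amdgpu_pkt_get_next_wptr() - start) | COND_EXEC_USERQ_OVERRULE_CMD;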