winsys/amdgpu: pad gfx and compute IBs with only 1 NOP

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25578>

Author: Marek Olšák
Date: 2023-10-05 06:50:29 -04:00
Committed by: Marge Bot
Commit: d8a41a27b9 (parent 87e9fafaa8)
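
For reference, and not part of the patch itself: a minimal standalone sketch of the single-packet padding scheme that amdgpu_pad_gfx_compute_ib() introduces below. The PM4 header encodings mirror Mesa's PKT2/PKT3 macros, but the pad_ib() helper, its parameters, and the macro definitions here are assumptions for illustration, not the actual winsys code.

/* Standalone sketch of the single-packet padding scheme that
 * amdgpu_pad_gfx_compute_ib() introduces.  The PM4 header encodings below
 * mirror Mesa's PKT2/PKT3 macros, but this pad_ib() helper and its
 * signature are illustrative only.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PM4_NOP_OPCODE 0x10                              /* type-3 NOP opcode */
#define PKT3(op, count, pred) \
   (0xC0000000u | (((uint32_t)(count) & 0x3FFFu) << 16) | ((uint32_t)(op) << 8) | (pred))
#define PKT2_NOP_PAD   0x80000000u                       /* 1-dword type-2 NOP */
#define PKT3_NOP_PAD   PKT3(PM4_NOP_OPCODE, 0x3FFF, 0)   /* count == -1: header only */

/* Pad *num_dw up to a multiple of (pad_dw_mask + 1) dwords with at most one packet. */
static void pad_ib(uint32_t *ib, uint32_t *num_dw, unsigned pad_dw_mask,
                   bool pad_with_type2)
{
   unsigned unaligned_dw = *num_dw & pad_dw_mask;

   if (unaligned_dw) {
      int remaining = pad_dw_mask + 1 - unaligned_dw;

      if (remaining == 1) {
         /* One missing dword: a type-2 NOP on chips that need it, otherwise a
          * header-only type-3 NOP (count == -1). */
         ib[(*num_dw)++] = pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
      } else {
         /* One variable-sized NOP: header + (remaining - 1) body dwords.
          * The body size is count + 1, hence count = remaining - 2. */
         ib[(*num_dw)++] = PKT3(PM4_NOP_OPCODE, remaining - 2, 0);
         *num_dw += remaining - 1;
      }
   }
   assert((*num_dw & pad_dw_mask) == 0);
}

With an 8-dword alignment (pad_dw_mask == 7) and 5 dwords of padding needed, this emits a single PKT3(NOP, 3, 0) header followed by 4 ignored body dwords, where the old code emitted 5 separate header-only NOPs.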

@@ -328,6 +328,31 @@ static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
    amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
 }
 
+static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *ws, enum amd_ip_type ip_type,
+                                      uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
+{
+   unsigned pad_dw_mask = ws->info.ip[ip_type].ib_pad_dw_mask;
+   unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
+
+   if (unaligned_dw) {
+      int remaining = pad_dw_mask + 1 - unaligned_dw;
+
+      /* Only pad by 1 dword with the type-2 NOP if necessary. */
+      if (remaining == 1 && ws->info.gfx_ib_pad_with_type2) {
+         ib[(*num_dw)++] = PKT2_NOP_PAD;
+      } else {
+         /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
+          * packet. The size of the packet body after the header is always count + 1.
+          * If count == -1, there is no packet body. NOP is the only packet that can have
+          * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
+          */
+         ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
+         *num_dw += remaining - 1;
+      }
+   }
+   assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
+}
+
 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
 {
    struct amdgpu_bo_alloc_request request = {0};
@@ -1169,9 +1194,7 @@ amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_i
    memcpy(map, preamble_ib, preamble_num_dw * 4);
 
    /* Pad the IB. */
-   uint32_t ib_pad_dw_mask = ws->info.ip[cs->ip_type].ib_pad_dw_mask;
-   while (preamble_num_dw & ib_pad_dw_mask)
-      map[preamble_num_dw++] = PKT3_NOP_PAD;
+   amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, map, &preamble_num_dw, 0);
    amdgpu_bo_unmap(&ws->dummy_ws.base, preamble_bo);
 
    for (unsigned i = 0; i < 2; i++) {
@@ -1245,17 +1268,14 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
    rcs->current.max_dw += cs_epilog_dw;
 
    /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
-   uint32_t ib_pad_dw_mask = cs->ws->info.ip[cs->ip_type].ib_pad_dw_mask;
-   while ((rcs->current.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
-      radeon_emit(rcs, PKT3_NOP_PAD);
+   amdgpu_pad_gfx_compute_ib(cs->ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
 
    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
    radeon_emit(rcs, va);
    radeon_emit(rcs, va >> 32);
    uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
 
-   assert((rcs->current.cdw & ib_pad_dw_mask) == 0);
-   assert((rcs->current.cdw & 7) == 0);
+   assert((rcs->current.cdw & cs->ws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
    assert(rcs->current.cdw <= rcs->current.max_dw);
 
    amdgpu_set_ib_size(rcs, ib);
@@ -1821,13 +1841,7 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
       break;
    case AMD_IP_GFX:
    case AMD_IP_COMPUTE:
-      if (ws->info.gfx_ib_pad_with_type2) {
-         while (rcs->current.cdw & ib_pad_dw_mask)
-            radeon_emit(rcs, PKT2_NOP_PAD);
-      } else {
-         while (rcs->current.cdw & ib_pad_dw_mask)
-            radeon_emit(rcs, PKT3_NOP_PAD);
-      }
+      amdgpu_pad_gfx_compute_ib(ws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
       if (cs->ip_type == AMD_IP_GFX)
         ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
       break;
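
A worked example for the amdgpu_cs_check_space() hunk, assuming the 8-dword IB alignment implied by the removed assert((rcs->current.cdw & 7) == 0), i.e. ib_pad_dw_mask == 7: if rcs->current.cdw is, say, 61 when the IB is chained, then (61 + 4) & 7 == 1, so the helper pads with a single PKT3(PKT3_NOP, 5, 0) header plus 6 body dwords (cdw becomes 68), and the 4-dword INDIRECT_BUFFER packet then ends the IB at 72, a multiple of 8. The old loop reached the same layout by emitting 7 separate header-only NOP dwords.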