ac/llvm: implement WA in nir to llvm

LLVM implements multiple workarounds for gfx11. The problem is that they're not applied for shaders built in parts. LLVM will be modified to be more conservative and apply the workaround in more places but in the meantime, add a simpler implementation in the NIR to LLVM backend: insert a wait at the end of each shader part. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10785 Cc: mesa-stable Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29304> (cherry picked from commit 14974fd097)
2026-01-03 15:50:17 +01:00 · 2024-06-13 17:57:00 +02:00 · 2024-06-13 17:57:00 +02:00 · 53d1306fe5
commit 53d1306fe5
parent bf86fa1b7d
6 changed files with 30 additions and 1 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@ -1004,7 +1004,7 @@
        "description": "ac/llvm: implement WA in nir to llvm",
        "nominated": true,
        "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": null,
        "notes": null
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1320,6 +1320,12 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,

   info->has_export_conflict_bug = info->gfx_level == GFX11;

+   /* When LLVM is fixed to handle multi-part shaders, this value will depend
+    * on the known good versions of LLVM. Until then, enable the equivalent WA
+    * in the nir -> llvm backend.
+    */
+   info->needs_llvm_wait_wa = info->gfx_level == GFX11;
+
   /* Convert the SDMA version in the current GPU to an enum. */
   info->sdma_ip_version =
      (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -120,6 +120,9 @@ struct radeon_info {
   bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */
   bool has_set_context_pairs_packed;
   bool has_set_sh_pairs_packed;
+   bool needs_llvm_wait_wa; /* True if the chip needs workarounds based on s_waitcnt_depctr but
+                             * the LLVM version doesn't apply them to multi-part shaders.
+                             */

   /* conformant_trunc_coord is equal to TA_CNTL2.TRUNCATE_COORD_MODE, which exists since gfx11.
    *
--- a/src/amd/common/ac_rtld.c
+++ b/src/amd/common/ac_rtld.c
@ -320,6 +320,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
      report_if(!part->sections);

      Elf_Scn *section = NULL;
+      bool first_section = true;
      while ((section = elf_nextscn(part->elf, section))) {
         Elf64_Shdr *shdr = elf64_getshdr(section);
         struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
@ -348,6 +349,13 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
            }

            if (s->is_pasted_text) {
+               if (part_idx > 0 && first_section && binary->options.waitcnt_wa) {
+                  /* Reserve a dword at the beginning of this part. */
+                  exec_size += 4;
+                  pasted_text_size += 4;
+                  first_section = false;
+               }
+
               s->offset = pasted_text_size;
               pasted_text_size += shdr->sh_size;
            } else {
@ -715,6 +723,7 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
   for (unsigned i = 0; i < u->binary->num_parts; ++i) {
      struct ac_rtld_part *part = &u->binary->parts[i];

+      bool first_section = true;
      Elf_Scn *section = NULL;
      while ((section = elf_nextscn(part->elf, section))) {
         Elf64_Shdr *shdr = elf64_getshdr(section);
@ -727,6 +736,13 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)

         Elf_Data *data = elf_getdata(section, NULL);
         report_elf_if(!data || data->d_size != shdr->sh_size);
+
+         if (i > 0 && first_section && u->binary->options.waitcnt_wa) {
+            assert(s->offset >= 4);
+            *(uint32_t *)(u->rx_ptr + s->offset - 4) = util_cpu_to_le32(0xbf880fff);
+            first_section = false;
+         }
+
         memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);

         size = MAX2(size, s->offset + shdr->sh_size);
--- a/src/amd/common/ac_rtld.h
+++ b/src/amd/common/ac_rtld.h
@ -35,6 +35,8 @@ struct ac_rtld_options {
   /* Loader will insert an s_sethalt 1 instruction as the
    * first instruction. */
   bool halt_at_entry : 1;
+
+   bool waitcnt_wa : 1;
 };

 /* Lightweight wrapper around underlying ELF objects. */
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@ -788,6 +788,8 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
+                                             .waitcnt_wa = num_parts > 1 &&
+                                                           screen->info.needs_llvm_wait_wa,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,