diff --git a/.pick_status.json b/.pick_status.json
index 16a064ae1b6..18776c6fc0a 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -1004,7 +1004,7 @@
         "description": "ac/llvm: implement WA in nir to llvm",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index ff16585ab80..7a83c7babaf 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1320,6 +1320,12 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
 
    info->has_export_conflict_bug = info->gfx_level == GFX11;
 
+   /* When LLVM is fixed to handle multi-part shaders, this value will depend
+    * on the known good versions of LLVM. Until then, enable the equivalent
+    * workaround (WA) in the nir -> llvm backend.
+    */
+   info->needs_llvm_wait_wa = info->gfx_level == GFX11;
+
    /* Convert the SDMA version in the current GPU to an enum. */
    info->sdma_ip_version =
       (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 2c292ee265a..c19d5163da7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -120,6 +120,9 @@ struct radeon_info {
    bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */
    bool has_set_context_pairs_packed;
    bool has_set_sh_pairs_packed;
+   bool needs_llvm_wait_wa; /* True if the chip needs workarounds based on s_waitcnt_depctr but
+                             * the LLVM version doesn't work with multi-part shaders.
+                             */
 
    /* conformant_trunc_coord is equal to TA_CNTL2.TRUNCATE_COORD_MODE, which exists since gfx11.
     *
diff --git a/src/amd/common/ac_rtld.c b/src/amd/common/ac_rtld.c
index 8faee660871..879afd5b370 100644
--- a/src/amd/common/ac_rtld.c
+++ b/src/amd/common/ac_rtld.c
@@ -320,6 +320,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
       report_if(!part->sections);
 
       Elf_Scn *section = NULL;
+      bool first_section = true;
       while ((section = elf_nextscn(part->elf, section))) {
          Elf64_Shdr *shdr = elf64_getshdr(section);
          struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
@@ -348,6 +349,13 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
             }
 
             if (s->is_pasted_text) {
+               if (part_idx > 0 && first_section && binary->options.waitcnt_wa) {
+                  /* Reserve a dword at the beginning of this part. */
+                  exec_size += 4;
+                  pasted_text_size += 4;
+                  first_section = false;
+               }
+
                s->offset = pasted_text_size;
                pasted_text_size += shdr->sh_size;
             } else {
@@ -715,6 +723,7 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
    for (unsigned i = 0; i < u->binary->num_parts; ++i) {
       struct ac_rtld_part *part = &u->binary->parts[i];
 
+      bool first_section = true;
       Elf_Scn *section = NULL;
       while ((section = elf_nextscn(part->elf, section))) {
          Elf64_Shdr *shdr = elf64_getshdr(section);
@@ -727,6 +736,13 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
 
          Elf_Data *data = elf_getdata(section, NULL);
          report_elf_if(!data || data->d_size != shdr->sh_size);
+
+         if (i > 0 && first_section && u->binary->options.waitcnt_wa) {
+            assert(s->offset >= 4);
+            *(uint32_t *)(u->rx_ptr + s->offset - 4) = util_cpu_to_le32(0xbf880fff);
+            first_section = false;
+         }
+
          memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
 
          size = MAX2(size, s->offset + shdr->sh_size);
diff --git a/src/amd/common/ac_rtld.h b/src/amd/common/ac_rtld.h
index c40145cb11d..cedb246980a 100644
--- a/src/amd/common/ac_rtld.h
+++ b/src/amd/common/ac_rtld.h
@@ -35,6 +35,8 @@ struct ac_rtld_options {
    /* Loader will insert an s_sethalt 1 instruction as the
     * first instruction. */
    bool halt_at_entry : 1;
+
+   bool waitcnt_wa : 1;
 };
 
 /* Lightweight wrapper around underlying ELF objects. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4c96a07c66a..182f116f403 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -788,6 +788,8 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                        .options =
                                           {
                                              .halt_at_entry = screen->options.halt_shaders,
+                                             .waitcnt_wa = num_parts > 1 &&
+                                                           screen->info.needs_llvm_wait_wa,
                                           },
                                        .shader_type = sel->stage,
                                        .wave_size = shader->wave_size,