From 53d1306fe5cb2001a3aecad4b937cef147ea5467 Mon Sep 17 00:00:00 2001
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Date: Thu, 13 Jun 2024 17:57:00 +0200
Subject: [PATCH] ac/llvm: implement WA in nir to llvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LLVM implements multiple workarounds for gfx11.
The problem is that they're not applied for shaders built in
parts.

LLVM will be modified to be more conservative and apply the
workaround in more places but in the meantime, add a simpler
implementation in the NIR to LLVM backend: insert a wait at
the end of each shader part.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10785
Cc: mesa-stable
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29304>
(cherry picked from commit 14974fd097f57814eac18648cfc07dd02c350d57)
---
 .pick_status.json                        |  2 +-
 src/amd/common/ac_gpu_info.c             |  6 ++++++
 src/amd/common/ac_gpu_info.h             |  3 +++
 src/amd/common/ac_rtld.c                 | 16 ++++++++++++++++
 src/amd/common/ac_rtld.h                 |  2 ++
 src/gallium/drivers/radeonsi/si_shader.c |  2 ++
 6 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/.pick_status.json b/.pick_status.json
index 16a064ae1b6..18776c6fc0a 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -1004,7 +1004,7 @@
         "description": "ac/llvm: implement WA in nir to llvm",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index ff16585ab80..7a83c7babaf 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1320,6 +1320,12 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
 
    info->has_export_conflict_bug = info->gfx_level == GFX11;
 
+   /* When LLVM is fixed to handle multi-part shaders, this value will depend
+    * on the known good versions of LLVM. Until then, enable the equivalent WA
+    * in the nir -> llvm backend.
+    */
+   info->needs_llvm_wait_wa = info->gfx_level == GFX11;
+
    /* Convert the SDMA version in the current GPU to an enum. */
    info->sdma_ip_version =
       (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 2c292ee265a..c19d5163da7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -120,6 +120,9 @@ struct radeon_info {
    bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */
    bool has_set_context_pairs_packed;
    bool has_set_sh_pairs_packed;
+   bool needs_llvm_wait_wa; /* True if the chip needs workarounds based on s_waitcnt_depctr but
+                             * the LLVM version doesn't apply them to multi-part shaders.
+                             */
 
    /* conformant_trunc_coord is equal to TA_CNTL2.TRUNCATE_COORD_MODE, which exists since gfx11.
     *
diff --git a/src/amd/common/ac_rtld.c b/src/amd/common/ac_rtld.c
index 8faee660871..879afd5b370 100644
--- a/src/amd/common/ac_rtld.c
+++ b/src/amd/common/ac_rtld.c
@@ -320,6 +320,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
       report_if(!part->sections);
 
       Elf_Scn *section = NULL;
+      bool first_section = true;
       while ((section = elf_nextscn(part->elf, section))) {
          Elf64_Shdr *shdr = elf64_getshdr(section);
          struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
@@ -348,6 +349,13 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
             }
 
             if (s->is_pasted_text) {
+               if (part_idx > 0 && first_section && binary->options.waitcnt_wa) {
+                  /* Reserve a dword at the beginning of this part. */
+                  exec_size += 4;
+                  pasted_text_size += 4;
+                  first_section = false;
+               }
+
                s->offset = pasted_text_size;
                pasted_text_size += shdr->sh_size;
             } else {
@@ -715,6 +723,7 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
    for (unsigned i = 0; i < u->binary->num_parts; ++i) {
       struct ac_rtld_part *part = &u->binary->parts[i];
 
+      bool first_section = true;
       Elf_Scn *section = NULL;
       while ((section = elf_nextscn(part->elf, section))) {
          Elf64_Shdr *shdr = elf64_getshdr(section);
@@ -727,6 +736,13 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
 
          Elf_Data *data = elf_getdata(section, NULL);
          report_elf_if(!data || data->d_size != shdr->sh_size);
+
+         if (i > 0 && first_section && u->binary->options.waitcnt_wa) {
+            assert(s->offset >= 4);
+            *(uint32_t *)(u->rx_ptr + s->offset - 4) = util_cpu_to_le32(0xbf880fff);
+            first_section = false;
+         }
+
          memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
 
          size = MAX2(size, s->offset + shdr->sh_size);
diff --git a/src/amd/common/ac_rtld.h b/src/amd/common/ac_rtld.h
index c40145cb11d..cedb246980a 100644
--- a/src/amd/common/ac_rtld.h
+++ b/src/amd/common/ac_rtld.h
@@ -35,6 +35,8 @@ struct ac_rtld_options {
    /* Loader will insert an s_sethalt 1 instruction as the
     * first instruction. */
    bool halt_at_entry : 1;
+
+   bool waitcnt_wa : 1;
 };
 
 /* Lightweight wrapper around underlying ELF objects. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4c96a07c66a..182f116f403 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -788,6 +788,8 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                        .options =
                                           {
                                              .halt_at_entry = screen->options.halt_shaders,
+                                             .waitcnt_wa = num_parts > 1 &&
+                                                           screen->info.needs_llvm_wait_wa,
                                           },
                                        .shader_type = sel->stage,
                                        .wave_size = shader->wave_size,