ac/llvm: implement WA in nir to llvm

LLVM implements multiple workarounds for gfx11.
The problem is that they're not applied for shaders built in
parts.

LLVM will be modified to be more conservative and apply the
workaround in more places, but in the meantime add a simpler
implementation in the NIR to LLVM backend: insert a wait at
the end of each shader part.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10785
Cc: mesa-stable
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29304>
(cherry picked from commit 14974fd097)
This commit is contained in:
Pierre-Eric Pelloux-Prayer 2024-06-13 17:57:00 +02:00 committed by Eric Engestrom
parent bf86fa1b7d
commit 53d1306fe5
6 changed files with 30 additions and 1 deletions

View file

@ -1004,7 +1004,7 @@
"description": "ac/llvm: implement WA in nir to llvm",
"nominated": true,
"nomination_type": 0,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -1320,6 +1320,12 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
info->has_export_conflict_bug = info->gfx_level == GFX11;
/* Once LLVM is fixed to handle multi-part shaders, this value will depend
* on the known-good versions of LLVM. Until then, enable the equivalent
* workaround in the nir -> llvm backend.
*/
info->needs_llvm_wait_wa = info->gfx_level == GFX11;
/* Convert the SDMA version in the current GPU to an enum. */
info->sdma_ip_version =
(enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,

View file

@ -120,6 +120,9 @@ struct radeon_info {
bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */
bool has_set_context_pairs_packed;
bool has_set_sh_pairs_packed;
bool needs_llvm_wait_wa; /* True if the chip needs workarounds based on s_waitcnt_depctr but
* LLVM's implementation doesn't handle multi-part shaders.
*/
/* conformant_trunc_coord is equal to TA_CNTL2.TRUNCATE_COORD_MODE, which exists since gfx11.
*

View file

@ -320,6 +320,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
report_if(!part->sections);
Elf_Scn *section = NULL;
bool first_section = true;
while ((section = elf_nextscn(part->elf, section))) {
Elf64_Shdr *shdr = elf64_getshdr(section);
struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
@ -348,6 +349,13 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
}
if (s->is_pasted_text) {
if (part_idx > 0 && first_section && binary->options.waitcnt_wa) {
/* Reserve a dword at the beginning of this part; ac_rtld_upload fills it in when waitcnt_wa is set. */
exec_size += 4;
pasted_text_size += 4;
first_section = false;
}
s->offset = pasted_text_size;
pasted_text_size += shdr->sh_size;
} else {
@ -715,6 +723,7 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
for (unsigned i = 0; i < u->binary->num_parts; ++i) {
struct ac_rtld_part *part = &u->binary->parts[i];
bool first_section = true;
Elf_Scn *section = NULL;
while ((section = elf_nextscn(part->elf, section))) {
Elf64_Shdr *shdr = elf64_getshdr(section);
@ -727,6 +736,13 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u)
Elf_Data *data = elf_getdata(section, NULL);
report_elf_if(!data || data->d_size != shdr->sh_size);
if (i > 0 && first_section && u->binary->options.waitcnt_wa) {
assert(s->offset >= 4);
*(uint32_t *)(u->rx_ptr + s->offset - 4) = util_cpu_to_le32(0xbf880fff);
first_section = false;
}
memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
size = MAX2(size, s->offset + shdr->sh_size);

View file

@ -35,6 +35,8 @@ struct ac_rtld_options {
/* Loader will insert an s_sethalt 1 instruction as the
* first instruction. */
bool halt_at_entry : 1;
bool waitcnt_wa : 1;
};
/* Lightweight wrapper around underlying ELF objects. */

View file

@ -788,6 +788,8 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
.options =
{
.halt_at_entry = screen->options.halt_shaders,
.waitcnt_wa = num_parts > 1 &&
screen->info.needs_llvm_wait_wa,
},
.shader_type = sel->stage,
.wave_size = shader->wave_size,