diff --git a/.pick_status.json b/.pick_status.json index 16a064ae1b6..18776c6fc0a 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1004,7 +1004,7 @@ "description": "ac/llvm: implement WA in nir to llvm", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index ff16585ab80..7a83c7babaf 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -1320,6 +1320,12 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->has_export_conflict_bug = info->gfx_level == GFX11; + /* When LLVM is fixed to handle multi-part shaders, this value will depend + * on the known good versions of LLVM. Until then, enable the equivalent WA + * in the nir -> llvm backend. + */ + info->needs_llvm_wait_wa = info->gfx_level == GFX11; + /* Convert the SDMA version in the current GPU to an enum. */ info->sdma_ip_version = (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major, diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 2c292ee265a..c19d5163da7 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -120,6 +120,9 @@ struct radeon_info { bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */ bool has_set_context_pairs_packed; bool has_set_sh_pairs_packed; + bool needs_llvm_wait_wa; /* True if the chip needs workarounds based on s_waitcnt_depctr but + * the LLVM version doesn't work with multi-part shaders. + */ /* conformant_trunc_coord is equal to TA_CNTL2.TRUNCATE_COORD_MODE, which exists since gfx11. 
* diff --git a/src/amd/common/ac_rtld.c b/src/amd/common/ac_rtld.c index 8faee660871..879afd5b370 100644 --- a/src/amd/common/ac_rtld.c +++ b/src/amd/common/ac_rtld.c @@ -320,6 +320,7 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i) report_if(!part->sections); Elf_Scn *section = NULL; + bool first_section = true; while ((section = elf_nextscn(part->elf, section))) { Elf64_Shdr *shdr = elf64_getshdr(section); struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)]; @@ -348,6 +349,13 @@ bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i) } if (s->is_pasted_text) { + if (part_idx > 0 && first_section && binary->options.waitcnt_wa) { + /* Reserve a dword at the beginning of this part. */ + exec_size += 4; + pasted_text_size += 4; + first_section = false; + } + s->offset = pasted_text_size; pasted_text_size += shdr->sh_size; } else { @@ -715,6 +723,7 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u) for (unsigned i = 0; i < u->binary->num_parts; ++i) { struct ac_rtld_part *part = &u->binary->parts[i]; + bool first_section = true; Elf_Scn *section = NULL; while ((section = elf_nextscn(part->elf, section))) { Elf64_Shdr *shdr = elf64_getshdr(section); @@ -727,6 +736,13 @@ int ac_rtld_upload(struct ac_rtld_upload_info *u) Elf_Data *data = elf_getdata(section, NULL); report_elf_if(!data || data->d_size != shdr->sh_size); + + if (i > 0 && first_section && u->binary->options.waitcnt_wa) { + assert(s->offset >= 4); + *(uint32_t *)(u->rx_ptr + s->offset - 4) = util_cpu_to_le32(0xbf880fff); + first_section = false; + } + memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size); size = MAX2(size, s->offset + shdr->sh_size); diff --git a/src/amd/common/ac_rtld.h b/src/amd/common/ac_rtld.h index c40145cb11d..cedb246980a 100644 --- a/src/amd/common/ac_rtld.h +++ b/src/amd/common/ac_rtld.h @@ -35,6 +35,8 @@ struct ac_rtld_options { /* Loader will insert an s_sethalt 1 instruction as the * first instruction. 
*/ bool halt_at_entry : 1; + + bool waitcnt_wa : 1; }; /* Lightweight wrapper around underlying ELF objects. */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4c96a07c66a..182f116f403 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -788,6 +788,8 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, .options = { .halt_at_entry = screen->options.halt_shaders, + .waitcnt_wa = num_parts > 1 && + screen->info.needs_llvm_wait_wa, }, .shader_type = sel->stage, .wave_size = shader->wave_size,