nak: use MemScope::CTA for shared memory scoped SCOPE_WORKGROUP barriers
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

CTA synchronizes between all threads within the same workgroup, so we
should use that over GPU, which has more severe performance
implications.

Sadly, it doesn't appear that we can rely on .CTA to work for global
memory, so let's keep using GPU for barriers that cover global memory
for now.

Speeds up vk_cooperative_matrix by roughly 40%

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36482>
This commit is contained in:
Karol Herbst 2025-07-31 03:35:26 +02:00 committed by Marge Bot
parent 50c6f31963
commit 67aeacc86b

View file

@ -3413,9 +3413,21 @@ impl<'a> ShaderFromNir<'a> {
if intrin.memory_scope() != SCOPE_NONE {
let mem_scope = match intrin.memory_scope() {
SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
MemScope::GPU
// A membar.gpu is very expensive so use .cta whenever
// possible.
// TODO: Figure out under which conditions we can relax
// them for global memory/images to CTA.
SCOPE_WORKGROUP => {
let global_modes = nir_var_image
| nir_var_mem_global
| nir_var_mem_ssbo;
if intrin.memory_modes() & global_modes != 0 {
MemScope::GPU
} else {
MemScope::CTA
}
}
SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => MemScope::GPU,
_ => panic!("Unhandled memory scope"),
};
b.push_op(OpMemBar { scope: mem_scope });