nak: use MemScope::CTA for shared memory scoped SCOPE_WORKGROUP barriers

CTA synchronizes between all threads within the same workgroup, so we should use that over GPU, which has more severe performance implications. Sadly, it doesn't appear that we can rely on .CTA to work for global memory, so let's keep using GPU for those cases for now. Speeds up vk_cooperative_matrix by roughly 40%. Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36482>
2026-05-05 13:58:04 +02:00 · 2025-07-31 03:35:26 +02:00 · 2025-07-31 03:35:26 +02:00 · 67aeacc86b
commit 67aeacc86b
parent 50c6f31963
1 changed files with 14 additions and 2 deletions
--- a/src/nouveau/compiler/nak/from_nir.rs
+++ b/src/nouveau/compiler/nak/from_nir.rs
@@ -3413,9 +3413,21 @@ impl<'a> ShaderFromNir<'a> {
                if intrin.memory_scope() != SCOPE_NONE {
                    let mem_scope = match intrin.memory_scope() {
                        SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
-                        SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
-                            MemScope::GPU
+                        // A membar.gpu is very expensive so use .cta whenever
+                        // possible.
+                        // TODO: Figure out under which conditions we can relax
+                        //       them for global memory/images to CTA.
+                        SCOPE_WORKGROUP => {
+                            let global_modes = nir_var_image
+                                | nir_var_mem_global
+                                | nir_var_mem_ssbo;
+                            if intrin.memory_modes() & global_modes != 0 {
+                                MemScope::GPU
+                            } else {
+                                MemScope::CTA
+                            }
                        }
+                        SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => MemScope::GPU,
                        _ => panic!("Unhandled memory scope"),
                    };
                    b.push_op(OpMemBar { scope: mem_scope });