diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
index 31487af8f09..a80675fa4f0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@@ -1304,15 +1304,19 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
    offset = LLVMBuildAShr(gallivm->builder, offset,
                           lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), "");
 
-   /* If the address is uniform, then use the address from invocation 0 to load,
-    * and broadcast to all invocations.
+   /* If the address is uniform, then use the address from the first active
+    * invocation to load, and broadcast to all invocations.  We can't do
+    * computed first active invocation for shared accesses (index == NULL),
+    * though, since those don't do bounds checking and we could use an invalid
+    * offset if exec_mask == 0.
     */
-   if (index_and_offset_are_uniform && invocation_0_must_be_active(bld_base)) {
+   if (index_and_offset_are_uniform && (invocation_0_must_be_active(bld_base) || index)) {
       LLVMValueRef ssbo_limit;
+      LLVMValueRef first_active = first_active_invocation(bld_base);
       LLVMValueRef mem_ptr = mem_access_base_pointer(bld_base, load_bld, bit_size, index,
-                                                     lp_build_const_int32(gallivm, 0), &ssbo_limit);
+                                                     first_active, &ssbo_limit);
 
-      offset = LLVMBuildExtractElement(gallivm->builder, offset, lp_build_const_int32(gallivm, 0), "");
+      offset = LLVMBuildExtractElement(gallivm->builder, offset, first_active, "");
 
       for (unsigned c = 0; c < nc; c++) {
          LLVMValueRef chan_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), "");