diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index b26dbafd36b..0e677fd5ff9 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1254,9 +1254,9 @@ load("per_view_output", [1, 1], [BASE, RANGE, COMPONENT, DEST_TYPE, IO_SEMANTICS
 # src[] = { primitive, offset }.
 load("per_primitive_output", [1, 1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE])
 # src[] = { offset }.
-load("shared", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
+load("shared", [1], [BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
-load("task_payload", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
+load("task_payload", [1], [BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
 load("push_constant", [1], [BASE, RANGE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { offset }.
@@ -1302,9 +1302,9 @@ store("per_primitive_output", [1, 1], [BASE, RANGE, WRITE_MASK, COMPONENT, SRC_T
 # src[] = { value, block_index, offset }
 store("ssbo", [-1, 1], [WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET, OFFSET_SHIFT])
 # src[] = { value, offset }.
-store("shared", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
+store("shared", [1], [BASE, ACCESS, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, offset }.
-store("task_payload", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
+store("task_payload", [1], [BASE, ACCESS, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, address }.
 store("global", [1], [WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, address }.
@@ -1963,10 +1963,10 @@ intrinsic("load_smem_amd", src_comp=[1, 1], dest_comp=0, bit_sizes=[32],
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # src[] = { offset }.
-intrinsic("load_shared2_amd", [1], dest_comp=2, indices=[OFFSET0, OFFSET1, ST64], flags=[CAN_ELIMINATE])
+intrinsic("load_shared2_amd", [1], dest_comp=2, indices=[ACCESS, OFFSET0, OFFSET1, ST64], flags=[CAN_ELIMINATE])
 
 # src[] = { value, offset }.
-intrinsic("store_shared2_amd", [2, 1], indices=[OFFSET0, OFFSET1, ST64])
+intrinsic("store_shared2_amd", [2, 1], indices=[ACCESS, OFFSET0, OFFSET1, ST64])
 
 # Vertex stride in LS-HS buffer
 system_value("lshs_vertex_stride_amd", 1)
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index 36026148524..16303466892 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -33,7 +33,7 @@
  * eg:
  * atomicAdd(a[0], 1) ->
  *
- * uint expected = a[0];
+ * uint expected = atomicLoad(a[0]);
  * while (true) {
  *    uint before = expected;
  *    expected += 1;
@@ -54,17 +54,18 @@ build_atomic(nir_builder *b, nir_intrinsic_instr *intr)
                            .align_mul = intr->def.bit_size / 8,
                            .align_offset = 0,
                            .offset_shift = nir_intrinsic_offset_shift(intr),
-                           .access = ACCESS_COHERENT);
+                           .access = ACCESS_ATOMIC | ACCESS_COHERENT);
       break;
    case nir_intrinsic_shared_atomic:
       load = nir_load_shared(b, 1, intr->def.bit_size, intr->src[0].ssa,
                              .align_mul = intr->def.bit_size / 8,
-                             .align_offset = 0);
+                             .align_offset = 0,
+                             .access = ACCESS_ATOMIC);
       break;
    case nir_intrinsic_global_atomic:
       load = nir_build_load_global(b, 1, intr->def.bit_size, intr->src[0].ssa,
-                                   .access = ACCESS_COHERENT);
+                                   .access = ACCESS_ATOMIC | ACCESS_COHERENT);
       break;
    default:
       UNREACHABLE("unsupported atomic type");
diff --git a/src/compiler/nir/nir_lower_atomics_to_ssbo.c b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
index 3b2032c34ce..610ebf80fe4 100644
--- a/src/compiler/nir/nir_lower_atomics_to_ssbo.c
+++ b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
@@ -104,7 +104,7 @@ lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b, un
    if (nir_intrinsic_has_atomic_op(new_instr))
       nir_intrinsic_set_atomic_op(new_instr, atomic_op);
    if (op == nir_intrinsic_load_ssbo)
-      nir_intrinsic_set_access(new_instr, ACCESS_COHERENT);
+      nir_intrinsic_set_access(new_instr, ACCESS_COHERENT | ACCESS_ATOMIC);
 
    /* a couple instructions need special handling since they don't map
     * 1:1 with ssbo atomics
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index e92cafb1108..8c858205871 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -1416,6 +1416,10 @@ can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *seco
        (first->access & ACCESS_VOLATILE) || first->info->is_unvectorizable)
       return false;
 
+   /* We can't change the bit size of atomic load/store */
+   if ((first->access & ACCESS_ATOMIC) && get_bit_size(first) != get_bit_size(second))
+      return false;
+
    if (first->intrin->intrinsic == nir_intrinsic_load_buffer_amd ||
        first->intrin->intrinsic == nir_intrinsic_store_buffer_amd) {
       if (first->access & ACCESS_USES_FORMAT_AMD)
@@ -1467,7 +1471,7 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    } else if (low_bit_size != high_bit_size &&
               new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
       new_bit_size = high_bit_size;
-   } else {
+   } else if (!(first->access & ACCESS_ATOMIC)) {
       new_bit_size = 64;
       for (; new_bit_size >= 8; new_bit_size /= 2) {
          /* don't repeat trying out bitsizes */
@@ -1478,6 +1482,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
       }
       if (new_bit_size < 8)
          return false;
+   } else {
+      return false;
    }
 
    unsigned new_num_components = new_size / new_bit_size;
@@ -1536,15 +1542,16 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
    if (first != low)
       offset = nir_iadd_imm(&b, offset, -(int)diff);
 
+   uint32_t access = nir_intrinsic_access(first->intrin);
    if (first->is_store) {
       nir_def *low_val = low->intrin->src[low->info->value_src].ssa;
       nir_def *high_val = high->intrin->src[high->info->value_src].ssa;
       nir_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
                               nir_bitcast_vector(&b, high_val, low_size * 8u));
-      nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64);
+      nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64, .access = access);
    } else {
       nir_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1 = diff / stride,
-                                              .st64 = st64);
+                                              .st64 = st64, .access = access);
       nir_def_rewrite_uses(&low->intrin->def,
                            nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
       nir_def_rewrite_uses(&high->intrin->def,
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index a13f3ea26e5..c9fbaab976f 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -851,6 +851,7 @@ print_access(enum gl_access_qualifier access, print_state *state, const char *se
       { ACCESS_KEEP_SCALAR, "keep-scalar" },
       { ACCESS_SMEM_AMD, "smem-amd" },
       { ACCESS_SKIP_HELPERS, "skip-helpers" },
+      { ACCESS_ATOMIC, "atomic" },
    };
 
    bool first = true;
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index 528bf775e98..953fc5096a0 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -1199,6 +1199,42 @@ enum gl_access_qualifier
     * Indicates that this load must be skipped by helper invocations.
     */
    ACCESS_SKIP_HELPERS = (1 << 17),
+
+   /**
+    * Indicates that this is an atomic load/store. Atomic RMW, swap, and other
+    * intrinsics which are always atomic, such as atomic_counter_read_deref,
+    * do not need this flag.
+    *
+    * If this is a vector load/store, then each component is considered its
+    * own atomic access.
+    *
+    * For non-shared load/store, instructions with this flag should also have
+    * ACCESS_COHERENT.
+    *
+    * The differences between atomic and non-atomic accesses can be summarized
+    * as follows:
+    * - Bounds checking of a 64-bit atomic access must be done per component,
+    *   not for each 32-bit part.
+    * - Atomic accesses are always coherent. Non-shared atomic load/store
+    *   should have the ACCESS_COHERENT flag.
+    * - Data races do not happen between two atomic accesses; each access
+    *   instead reads/writes a valid value. Two non-atomic accesses, or an
+    *   atomic access and a non-atomic access, can data race, which is either
+    *   undefined behaviour or an undefined value, depending on
+    *   shader_info::assume_no_data_races.
+    * - Because of data races, atomics are necessary for synchronization
+    *   without barriers. In the Vulkan memory model, synchronizes-with
+    *   relations only form between two memory barriers if control barriers or
+    *   atomic accesses are involved.
+    *
+    * Some hardware can "tear" loads with a subgroup-uniform address, which
+    * means that a store from a different subgroup interrupts the load,
+    * causing the result to not be subgroup-uniform and instead be a mix of
+    * the old and new values, despite the address being subgroup-uniform. If
+    * a load is not atomic and assume_no_data_races=true, we can assume that
+    * the load never tears.
+    */
+   ACCESS_ATOMIC = (1 << 18),
 };
 
 /**
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
index c998c18e888..f4be4e5cba0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@@ -4579,7 +4579,7 @@ visit_shared_store(struct lp_build_nir_soa_context *bld,
    LLVMValueRef val[NIR_MAX_VEC_COMPONENTS] = { NULL };
    get_src_vec(bld, 0, val);
    LLVMValueRef offset = get_src(bld, &instr->src[1], 0);
-   int writemask = instr->const_index[1];
+   int writemask = nir_intrinsic_write_mask(instr);
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
    emit_store_mem(bld, writemask, nc, bitsize, false, true, NULL, offset, val);
@@ -4970,7 +4970,7 @@ visit_payload_store(struct lp_build_nir_soa_context *bld,
    LLVMValueRef val[NIR_MAX_VEC_COMPONENTS] = { NULL };
    get_src_vec(bld, 0, val);
    LLVMValueRef offset = get_src(bld, &instr->src[1], 0);
-   int writemask = instr->const_index[1];
+   int writemask = nir_intrinsic_write_mask(instr);
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
    emit_store_mem(bld, writemask, nc, bitsize, true, true, NULL, offset, val);
diff --git a/src/imagination/pco/usclib/libcl.h b/src/imagination/pco/usclib/libcl.h
index 13fea5f6b3e..dadfbf9e873 100644
--- a/src/imagination/pco/usclib/libcl.h
+++ b/src/imagination/pco/usclib/libcl.h
@@ -34,11 +34,12 @@ void nir_store_ssbo(uint32_t value,
                     uint offset_shift);
 
 uint32_t
-nir_load_shared(uint offset, uint base, uint align_mul, uint align_offset);
+nir_load_shared(uint offset, uint base, uint access, uint align_mul, uint align_offset);
 
 void nir_store_shared(uint32_t value,
                       uint offset,
                       uint base,
+                      uint access,
                       uint write_mask,
                       uint align_mul,
                       uint align_offset);
diff --git a/src/imagination/pco/usclib/sync.cl b/src/imagination/pco/usclib/sync.cl
index 100d8f29ed1..7678013e238 100644
--- a/src/imagination/pco/usclib/sync.cl
+++ b/src/imagination/pco/usclib/sync.cl
@@ -57,8 +57,8 @@ usclib_emu_global_atomic_comp_swap(uint32_t addr_lo, uint32_t addr_hi, uint comp
 
 void usclib_barrier(uint num_slots, uint counter_offset)
 {
-   #define load_barrier_counter() nir_load_shared(counter_offset, 0, 4, 0)
-   #define store_barrier_counter(value) nir_store_shared(value, counter_offset, 0, 0x1, 4, 0)
+   #define load_barrier_counter() nir_load_shared(counter_offset, 0, 0, 4, 0)
+   #define store_barrier_counter(value) nir_store_shared(value, counter_offset, 0, 0, 0x1, 4, 0)
 
    bool is_inst_zero = !nir_load_instance_num_pco();
 
@@ -85,5 +85,5 @@
 void usclib_zero_init_wg_mem(uint count)
 {
    for (unsigned u = 0; u < count; ++u)
-      nir_store_shared(0, u * sizeof(uint32_t), 0, 0x1, 4, 0);
+      nir_store_shared(0, u * sizeof(uint32_t), 0, 0, 0x1, 4, 0);
 }
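
As a side note for reviewers, a minimal sketch of how a pass or backend might consume the new flag, analogous to the check added to can_vectorize() above. The helper name example_can_split_access is hypothetical and not part of this series; nir_intrinsic_has_access() and nir_intrinsic_access() are the existing generated index accessors.

   #include "nir.h"

   /* Hypothetical helper: whether a pass may split or widen this load/store.
    * Per the ACCESS_ATOMIC comment in shader_enums.h, atomic accesses must be
    * kept whole (no per-dword handling and no bit-size changes). */
   static bool
   example_can_split_access(const nir_intrinsic_instr *intrin)
   {
      const unsigned access =
         nir_intrinsic_has_access(intrin) ? nir_intrinsic_access(intrin) : 0;

      return !(access & ACCESS_ATOMIC);
   }

Passes that only widen, split, or speculate on non-atomic accesses can keep their existing paths and simply bail out when this returns false.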