nir: add ACCESS_ATOMIC
This is so that passes and backends can tell if a coherent load/store is
atomic or not, instead of having to assume it could be either.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36602>
parent 4e762df664
commit 0dd09a292b

9 changed files with 66 additions and 20 deletions
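Not part of the commit: a minimal sketch of what the new flag buys a pass or backend. It assumes the usual NIR headers and the generated intrinsic accessors (nir_intrinsic_has_access / nir_intrinsic_access); the helper name is hypothetical.

#include <stdbool.h>
#include "nir.h"

/* Hypothetical helper: returns true if a load/store intrinsic was marked as
 * an atomic access by whatever lowered it (e.g. the CAS-loop lowering below
 * sets ACCESS_ATOMIC on the initial load). Before this commit, a backend
 * could only see ACCESS_COHERENT and had to assume the access might be
 * atomic. */
static bool
intrin_is_atomic_access(const nir_intrinsic_instr *intr)
{
   return nir_intrinsic_has_access(intr) &&
          (nir_intrinsic_access(intr) & ACCESS_ATOMIC);
}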
@@ -1254,9 +1254,9 @@ load("per_view_output", [1, 1], [BASE, RANGE, COMPONENT, DEST_TYPE, IO_SEMANTICS
 # src[] = { primitive, offset }.
 load("per_primitive_output", [1, 1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE])
 # src[] = { offset }.
-load("shared", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
+load("shared", [1], [BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
-load("task_payload", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
+load("task_payload", [1], [BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { offset }.
 load("push_constant", [1], [BASE, RANGE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { offset }.
@@ -1302,9 +1302,9 @@ store("per_primitive_output", [1, 1], [BASE, RANGE, WRITE_MASK, COMPONENT, SRC_T
 # src[] = { value, block_index, offset }
 store("ssbo", [-1, 1], [WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET, OFFSET_SHIFT])
 # src[] = { value, offset }.
-store("shared", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
+store("shared", [1], [BASE, ACCESS, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, offset }.
-store("task_payload", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
+store("task_payload", [1], [BASE, ACCESS, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, address }.
 store("global", [1], [WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, address }.
@@ -1963,10 +1963,10 @@ intrinsic("load_smem_amd", src_comp=[1, 1], dest_comp=0, bit_sizes=[32],
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # src[] = { offset }.
-intrinsic("load_shared2_amd", [1], dest_comp=2, indices=[OFFSET0, OFFSET1, ST64], flags=[CAN_ELIMINATE])
+intrinsic("load_shared2_amd", [1], dest_comp=2, indices=[ACCESS, OFFSET0, OFFSET1, ST64], flags=[CAN_ELIMINATE])
 
 # src[] = { value, offset }.
-intrinsic("store_shared2_amd", [2, 1], indices=[OFFSET0, OFFSET1, ST64])
+intrinsic("store_shared2_amd", [2, 1], indices=[ACCESS, OFFSET0, OFFSET1, ST64])
 
 # Vertex stride in LS-HS buffer
 system_value("lshs_vertex_stride_amd", 1)
@@ -33,7 +33,7 @@
  * eg:
  * atomicAdd(a[0], 1) ->
  *
- * uint expected = a[0];
+ * uint expected = atomicLoad(a[0]);
  * while (true) {
  *    uint before = expected;
  *    expected += 1;
@@ -54,17 +54,18 @@ build_atomic(nir_builder *b, nir_intrinsic_instr *intr)
                             .align_mul = intr->def.bit_size / 8,
                             .align_offset = 0,
                             .offset_shift = nir_intrinsic_offset_shift(intr),
-                            .access = ACCESS_COHERENT);
+                            .access = ACCESS_ATOMIC | ACCESS_COHERENT);
      break;
   case nir_intrinsic_shared_atomic:
      load = nir_load_shared(b, 1, intr->def.bit_size,
                             intr->src[0].ssa,
                             .align_mul = intr->def.bit_size / 8,
-                            .align_offset = 0);
+                            .align_offset = 0,
+                            .access = ACCESS_ATOMIC);
      break;
   case nir_intrinsic_global_atomic:
      load = nir_build_load_global(b, 1, intr->def.bit_size, intr->src[0].ssa,
-                                  .access = ACCESS_COHERENT);
+                                  .access = ACCESS_ATOMIC | ACCESS_COHERENT);
      break;
   default:
      UNREACHABLE("unsupported atomic type");
@@ -104,7 +104,7 @@ lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b, un
    if (nir_intrinsic_has_atomic_op(new_instr))
       nir_intrinsic_set_atomic_op(new_instr, atomic_op);
    if (op == nir_intrinsic_load_ssbo)
-      nir_intrinsic_set_access(new_instr, ACCESS_COHERENT);
+      nir_intrinsic_set_access(new_instr, ACCESS_COHERENT | ACCESS_ATOMIC);
 
    /* a couple instructions need special handling since they don't map
    * 1:1 with ssbo atomics
@@ -1416,6 +1416,10 @@ can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *seco
        (first->access & ACCESS_VOLATILE) || first->info->is_unvectorizable)
       return false;
 
+   /* We can't change the bit size of atomic load/store */
+   if ((first->access & ACCESS_ATOMIC) && get_bit_size(first) != get_bit_size(second))
+      return false;
+
    if (first->intrin->intrinsic == nir_intrinsic_load_buffer_amd ||
        first->intrin->intrinsic == nir_intrinsic_store_buffer_amd) {
      if (first->access & ACCESS_USES_FORMAT_AMD)
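An aside on the check added above, not from the patch itself: because each component of a vector access counts as its own atomic access, the vectorizer may still merge two adjacent 32-bit atomic shared loads into one 2x32 vector load, but it must not re-emit them as a single 64-bit load, since that would change the size of each atomic operation. A hypothetical builder-level illustration (the function and its arguments are assumptions, not Mesa code):

#include "nir_builder.h"

/* Hypothetical illustration of the bit-size rule for atomic accesses. */
static void
sketch_atomic_vectorization(nir_builder *b, nir_def *off)
{
   /* Allowed result: one 2x32 vector load; each component remains its own
    * 32-bit atomic access. */
   nir_def *vec = nir_load_shared(b, 2, 32, off,
                                  .access = ACCESS_ATOMIC, .align_mul = 4);

   /* Not an allowed replacement for the same pair: a single 64-bit load,
    * because the bit size of each atomic access would change. */
   nir_def *wide = nir_load_shared(b, 1, 64, off,
                                   .access = ACCESS_ATOMIC, .align_mul = 8);

   (void)vec;
   (void)wide;
}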
@@ -1467,7 +1471,7 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    } else if (low_bit_size != high_bit_size &&
               new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
       new_bit_size = high_bit_size;
-   } else {
+   } else if (!(first->access & ACCESS_ATOMIC)) {
       new_bit_size = 64;
       for (; new_bit_size >= 8; new_bit_size /= 2) {
          /* don't repeat trying out bitsizes */
@@ -1478,6 +1482,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
       }
       if (new_bit_size < 8)
          return false;
+   } else {
+      return false;
    }
    unsigned new_num_components = new_size / new_bit_size;
 
@@ -1536,15 +1542,16 @@ try_vectorize_shared2(struct vectorize_ctx *ctx,
    if (first != low)
      offset = nir_iadd_imm(&b, offset, -(int)diff);
 
+   uint32_t access = nir_intrinsic_access(first->intrin);
    if (first->is_store) {
      nir_def *low_val = low->intrin->src[low->info->value_src].ssa;
      nir_def *high_val = high->intrin->src[high->info->value_src].ssa;
      nir_def *val = nir_vec2(&b, nir_bitcast_vector(&b, low_val, low_size * 8u),
                              nir_bitcast_vector(&b, high_val, low_size * 8u));
-     nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64);
+     nir_store_shared2_amd(&b, val, offset, .offset1 = diff / stride, .st64 = st64, .access = access);
    } else {
      nir_def *new_def = nir_load_shared2_amd(&b, low_size * 8u, offset, .offset1 = diff / stride,
-                                             .st64 = st64);
+                                             .st64 = st64, .access = access);
      nir_def_rewrite_uses(&low->intrin->def,
                           nir_bitcast_vector(&b, nir_channel(&b, new_def, 0), low_bit_size));
      nir_def_rewrite_uses(&high->intrin->def,
@@ -851,6 +851,7 @@ print_access(enum gl_access_qualifier access, print_state *state, const char *se
       { ACCESS_KEEP_SCALAR, "keep-scalar" },
       { ACCESS_SMEM_AMD, "smem-amd" },
       { ACCESS_SKIP_HELPERS, "skip-helpers" },
+      { ACCESS_ATOMIC, "atomic" },
    };
 
    bool first = true;
@@ -1199,6 +1199,42 @@ enum gl_access_qualifier
     * Indicates that this load must be skipped by helper invocations.
     */
    ACCESS_SKIP_HELPERS = (1 << 17),
+
+   /**
+    * Indicates that this is an atomic load/store. Atomic RMW, swap, and other
+    * intrinsics which are always atomic such as atomic_counter_read_deref do
+    * not need this flag.
+    *
+    * If this is a vector load/store, then each component is considered its
+    * own atomic access.
+    *
+    * For non-shared load/store, instructions with this flag should also have
+    * ACCESS_COHERENT.
+    *
+    * The differences between atomic and non-atomic accesses can be summarized
+    * as follows:
+    * - Bounds checking of a 64-bit atomic access must be done per-component,
+    *   and not for each 32-bit part.
+    * - Atomic accesses are always coherent. Non-shared atomic load/store
+    *   should have the ACCESS_COHERENT flag.
+    * - Data races do not happen with two atomic accesses, with each access
+    *   instead reading/writing a valid value. Two non-atomic accesses or an
+    *   atomic access and a non-atomic access can data race, which is either
+    *   undefined behaviour or undefined value, depending on
+    *   shader_info::assume_no_data_races.
+    * - Because of data races, atomics are necessary for synchronization
+    *   without barriers. In the Vulkan memory model, synchronizes-with
+    *   relations only form between two memory barriers if control barriers or
+    *   atomic accesses are involved.
+    *
+    * Some hardware can "tear" loads with a subgroup uniform address, which
+    * means that a store from a different subgroup interrupts the load,
+    * causing the result to not be subgroup uniform and instead be a mix of
+    * the old and new values, despite the address being subgroup uniform. If
+    * a load is not atomic and assume_no_data_races=true, we can assume that
+    * the load never tears.
+    */
+   ACCESS_ATOMIC = (1 << 18),
 };
 
 /**
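To make the new comment concrete, a minimal usage sketch (not part of the commit) of emitting atomic accesses through the builder, following the rule above that non-shared atomic load/store should also carry ACCESS_COHERENT. The helper and its arguments are hypothetical.

#include "nir_builder.h"

/* Hypothetical: load a 32-bit flag at 'addr' and separately store zero over
 * it. Each access is individually atomic (the pair is NOT a read-modify-
 * write), and both also carry ACCESS_COHERENT as the comment prescribes for
 * non-shared atomics. */
static nir_def *
sketch_atomic_flag_read_then_clear(nir_builder *b, nir_def *addr)
{
   nir_def *flag =
      nir_build_load_global(b, 1, 32, addr,
                            .access = ACCESS_ATOMIC | ACCESS_COHERENT,
                            .align_mul = 4);

   nir_build_store_global(b, nir_imm_int(b, 0), addr,
                          .access = ACCESS_ATOMIC | ACCESS_COHERENT,
                          .align_mul = 4, .write_mask = 0x1);

   return flag;
}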
@@ -4579,7 +4579,7 @@ visit_shared_store(struct lp_build_nir_soa_context *bld,
    LLVMValueRef val[NIR_MAX_VEC_COMPONENTS] = { NULL };
    get_src_vec(bld, 0, val);
    LLVMValueRef offset = get_src(bld, &instr->src[1], 0);
-   int writemask = instr->const_index[1];
+   int writemask = nir_intrinsic_write_mask(instr);
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
    emit_store_mem(bld, writemask, nc, bitsize, false, true, NULL, offset, val);
@@ -4970,7 +4970,7 @@ visit_payload_store(struct lp_build_nir_soa_context *bld,
    LLVMValueRef val[NIR_MAX_VEC_COMPONENTS] = { NULL };
    get_src_vec(bld, 0, val);
    LLVMValueRef offset = get_src(bld, &instr->src[1], 0);
-   int writemask = instr->const_index[1];
+   int writemask = nir_intrinsic_write_mask(instr);
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
    emit_store_mem(bld, writemask, nc, bitsize, true, true, NULL, offset, val);
@@ -34,11 +34,12 @@ void nir_store_ssbo(uint32_t value,
                     uint offset_shift);
 
 uint32_t
-nir_load_shared(uint offset, uint base, uint align_mul, uint align_offset);
+nir_load_shared(uint offset, uint base, uint access, uint align_mul, uint align_offset);
 
 void nir_store_shared(uint32_t value,
                       uint offset,
                       uint base,
+                      uint access,
                       uint write_mask,
                       uint align_mul,
                       uint align_offset);
@@ -57,8 +57,8 @@ usclib_emu_global_atomic_comp_swap(uint32_t addr_lo, uint32_t addr_hi, uint comp
 void
 usclib_barrier(uint num_slots, uint counter_offset)
 {
-#define load_barrier_counter() nir_load_shared(counter_offset, 0, 4, 0)
-#define store_barrier_counter(value) nir_store_shared(value, counter_offset, 0, 0x1, 4, 0)
+#define load_barrier_counter() nir_load_shared(counter_offset, 0, 0, 4, 0)
+#define store_barrier_counter(value) nir_store_shared(value, counter_offset, 0, 0, 0x1, 4, 0)
 
    bool is_inst_zero = !nir_load_instance_num_pco();
 
@@ -85,5 +85,5 @@ void
 usclib_zero_init_wg_mem(uint count)
 {
    for (unsigned u = 0; u < count; ++u)
-      nir_store_shared(0, u * sizeof(uint32_t), 0, 0x1, 4, 0);
+      nir_store_shared(0, u * sizeof(uint32_t), 0, 0, 0x1, 4, 0);
 }