nir_lower_mem_access_bit_sizes: Support unaligned stores via a pair of atomics
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/8282
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23173>
parent 082eba6165
commit c70d94a889

2 changed files with 89 additions and 14 deletions
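The core trick: a store that is narrower than, or misaligned with respect to, what the hardware can write directly is rewritten as a clear-then-set pair on the enclosing aligned 32-bit word. An atomic iand first zeroes the destination bits, then an atomic ior deposits the shifted payload. A minimal C model of the word-level effect (the helper name and shape are illustrative only; the real pass emits NIR atomic intrinsics, as the diff below shows):

#include <assert.h>
#include <stdint.h>

/* Model of the iand/ior atomic pair: store the low `bytes` bytes of
 * `data` at unaligned byte address `addr`, touching only one aligned
 * 32-bit word. Illustrative only, not part of the patch. */
static void
store_unaligned_via_atomics(uint32_t *mem, uint32_t addr, uint32_t data,
                            unsigned bytes)
{
   uint32_t pad   = addr & 3;           /* byte position inside the dword */
   uint32_t word  = (addr & ~3u) / 4;   /* index of the aligned dword */
   uint32_t shift = pad * 8;
   uint32_t mask  = (bytes == 4) ? ~0u : ((1u << (bytes * 8)) - 1);

   assert(pad + bytes <= 4);            /* a chunk never straddles a dword */

   mem[word] &= ~(mask << shift);       /* "atomic iand": clear target bits */
   mem[word] |= (data & mask) << shift; /* "atomic ior": deposit new bits */
}

Because iand and ior with disjoint masks commute, concurrent stores to neighbouring bytes of the same word cannot corrupt each other. The pair is not a single atomic transaction, however, so a racing reader can briefly observe the cleared bits.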
@@ -5115,6 +5115,7 @@ typedef nir_mem_access_size_align
 typedef struct {
    nir_lower_mem_access_bit_sizes_cb callback;
    nir_variable_mode modes;
+   bool may_lower_unaligned_stores_to_atomics;
    void *cb_data;
 } nir_lower_mem_access_bit_sizes_options;
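Drivers opt in through the new flag when filling in the options struct. A hedged sketch of the setup (driver_mem_access_cb and the mode mask are placeholders, and this assumes the pass entry point takes the shader and an options pointer):

/* Hypothetical driver setup; `driver_mem_access_cb` stands in for the
 * backend's size/align callback. */
const nir_lower_mem_access_bit_sizes_options options = {
   .callback = driver_mem_access_cb,
   .modes = nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared,
   .may_lower_unaligned_stores_to_atomics = true,
   .cb_data = NULL,
};

bool progress = nir_lower_mem_access_bit_sizes(shader, &options);

Leaving the flag false preserves the old behaviour: a callback that cannot honour a chunk's actual alignment trips an assert instead of taking the atomic fallback.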
@@ -230,7 +230,7 @@ lower_mem_load(nir_builder *b, nir_intrinsic_instr *intrin,
 static bool
 lower_mem_store(nir_builder *b, nir_intrinsic_instr *intrin,
                 nir_lower_mem_access_bit_sizes_cb mem_access_size_align_cb,
-                const void *cb_data)
+                const void *cb_data, bool allow_unaligned_stores_as_atomics)
 {
    assert(intrin->src[0].is_ssa);
    nir_ssa_def *value = intrin->src[0].ssa;
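For context, the callback (whose call site appears in the next hunk) maps a requested access onto what the hardware supports. Below is a sketch of a backend that only does dword-aligned 32-bit stores, written against the seven-argument call site in this file; the parameter types are an assumption based on the typedef referenced in the nir.h hunk above:

/* Illustrative callback, not from the patch. Returning .align = 4 for a
 * chunk whose combined alignment is 1 or 2 is exactly what now routes
 * stores into the atomic fallback instead of hitting the old asserts. */
static nir_mem_access_size_align
dword_only_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size,
              uint32_t align_mul, uint32_t align_offset,
              bool offset_is_const, const void *cb_data)
{
   return (nir_mem_access_size_align){
      .num_components = 1,
      .bit_size = 32,
      .align = 4,
   };
}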
@@ -287,28 +287,101 @@ lower_mem_store(nir_builder *b, nir_intrinsic_instr *intrin,
       const uint32_t max_chunk_bytes = end - chunk_start;
       const uint32_t chunk_align_offset =
          (whole_align_offset + chunk_start) % align_mul;
+      const uint32_t chunk_align =
+         nir_combined_align(align_mul, chunk_align_offset);
 
       requested = mem_access_size_align_cb(intrin->intrinsic, max_chunk_bytes,
                                            bit_size, align_mul, chunk_align_offset,
                                            offset_is_const, cb_data);
 
-      const uint32_t chunk_bytes =
-         requested.num_components * (requested.bit_size / 8);
-      assert(chunk_bytes <= max_chunk_bytes);
+      uint32_t chunk_bytes = requested.num_components * (requested.bit_size / 8);
 
       assert(util_is_power_of_two_nonzero(requested.align));
-      assert(requested.align <= align_mul);
-      assert((chunk_align_offset % requested.align) == 0);
-
-      nir_ssa_def *packed = nir_extract_bits(b, &value, 1, chunk_start * 8,
-                                             requested.num_components,
-                                             requested.bit_size);
-
-      nir_ssa_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
-      dup_mem_intrinsic(b, intrin, chunk_offset,
-                        align_mul, chunk_align_offset, packed,
-                        requested.num_components, requested.bit_size);
+      if (chunk_align < requested.align ||
+          chunk_bytes > max_chunk_bytes) {
+         /* Otherwise the caller made a mistake with their return values. */
+         assert(chunk_bytes <= 32);
+         assert(allow_unaligned_stores_as_atomics);
+
+         /* We'll turn this into a pair of 32-bit atomics to modify only the
+          * right bits of memory.
+          */
+         requested = (nir_mem_access_size_align){
+            .align = 4,
+            .bit_size = 32,
+            .num_components = 1,
+         };
+
+         uint64_t align_mask = requested.align - 1;
+         nir_ssa_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
+         nir_ssa_def *pad = chunk_align < 4 ?
+            nir_iand_imm(b, chunk_offset, align_mask) :
+            nir_imm_intN_t(b, 0, chunk_offset->bit_size);
+         chunk_offset = nir_iand_imm(b, chunk_offset, ~align_mask);
+
+         unsigned max_pad = chunk_align < requested.align ?
+            requested.align - chunk_align : 0;
+         unsigned requested_bytes =
+            requested.num_components * requested.bit_size / 8;
+         chunk_bytes = MIN2(max_chunk_bytes, requested_bytes - max_pad);
+         unsigned chunk_bits = chunk_bytes * 8;
+
+         nir_ssa_def *chunk_value = value;
+         /* The one special case where nir_extract_bits cannot get a scalar
+          * by asking for 1 component of chunk_bits.
+          */
+         if (chunk_bits == 24) {
+            chunk_value = nir_pad_vec4(b, chunk_value);
+            chunk_bits = 32;
+         }
+
+         nir_ssa_def *data = nir_u2u32(b,
+            nir_extract_bits(b, &chunk_value, 1, chunk_start * 8,
+                             1, chunk_bits));
+         nir_ssa_def *iand_mask = nir_imm_int(b, (1 << chunk_bits) - 1);
+
+         if (chunk_align < requested.align) {
+            nir_ssa_def *shift = nir_imul_imm(b, pad, 8);
+            data = nir_ishl(b, data, shift);
+            iand_mask = nir_inot(b, nir_ishl(b, iand_mask, shift));
+         }
+
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_store_ssbo:
+            nir_build_ssbo_atomic(b, 32, intrin->src[1].ssa, chunk_offset,
+                                  iand_mask,
+                                  .atomic_op = nir_atomic_op_iand,
+                                  .access = nir_intrinsic_access(intrin));
+            nir_build_ssbo_atomic(b, 32, intrin->src[1].ssa, chunk_offset,
+                                  data,
+                                  .atomic_op = nir_atomic_op_ior,
+                                  .access = nir_intrinsic_access(intrin));
+            break;
+         case nir_intrinsic_store_global:
+            nir_build_global_atomic(b, 32, chunk_offset, iand_mask,
+                                    .atomic_op = nir_atomic_op_iand);
+            nir_build_global_atomic(b, 32, chunk_offset, data,
+                                    .atomic_op = nir_atomic_op_ior);
+            break;
+         case nir_intrinsic_store_shared:
+            nir_build_shared_atomic(b, 32, chunk_offset, iand_mask,
+                                    .atomic_op = nir_atomic_op_iand,
+                                    .base = nir_intrinsic_base(intrin));
+            nir_build_shared_atomic(b, 32, chunk_offset, data,
+                                    .atomic_op = nir_atomic_op_ior,
+                                    .base = nir_intrinsic_base(intrin));
+            break;
+         default:
+            unreachable("Unsupported unaligned store");
+         }
+      } else {
+         nir_ssa_def *packed = nir_extract_bits(b, &value, 1, chunk_start * 8,
+                                                requested.num_components,
+                                                requested.bit_size);
+
+         nir_ssa_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
+         dup_mem_intrinsic(b, intrin, chunk_offset,
+                           align_mul, chunk_align_offset, packed,
+                           requested.num_components, requested.bit_size);
+      }
       BITSET_CLEAR_RANGE(mask, chunk_start, (chunk_start + chunk_bytes - 1));
    }
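To make the new address arithmetic concrete, here is a worked trace for one chunk that starts at byte offset 5 with a combined alignment of 1, after `requested` has been overridden to a single dword. These are plain C stand-ins for the nir_iand_imm/nir_ishl/nir_inot chain above, with the resulting values in comments:

uint32_t chunk_offset = 5;
uint32_t align_mask   = 4 - 1;                      /* requested.align - 1 */
uint32_t pad          = chunk_offset & align_mask;  /* 1 byte into the dword */
chunk_offset         &= ~align_mask;                /* 4: aligned dword base */

/* max_pad = 4 - chunk_align = 3 and requested_bytes = 4, so
 * chunk_bytes = MIN2(max_chunk_bytes, 4 - 3) = 1 and chunk_bits = 8. */
uint32_t shift     = pad * 8;                       /* 8 */
uint32_t data      = 0xABu << shift;                /* 0x0000AB00 */
uint32_t iand_mask = ~(((1u << 8) - 1) << shift);   /* 0xFFFF00FF */
/* Atomic 1: word &= 0xFFFF00FF clears byte 5 of memory;
 * atomic 2: word |= 0x0000AB00 writes it, leaving bytes 4, 6, 7 intact. */

Note that max_pad is a worst case: when the offset is not compile-time constant, the chunk must fit the dword for any runtime pad up to max_pad, which is why chunk_bytes shrinks to the chunk's combined alignment.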
@@ -381,7 +454,8 @@ lower_mem_access_instr(nir_builder *b, nir_instr *instr, void *_data)
    case nir_intrinsic_store_shared:
    case nir_intrinsic_store_scratch:
    case nir_intrinsic_store_task_payload:
-      return lower_mem_store(b, intrin, state->callback, state->cb_data);
+      return lower_mem_store(b, intrin, state->callback, state->cb_data,
+                             state->may_lower_unaligned_stores_to_atomics);
 
    default:
       return false;