mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 18:00:13 +01:00
aco: add helpers for splitting stores
split_store_data() splits a vector and p_as_uniforms it if needed. scan_write_mask()/advance_write_mask() are similar to u_bit_scan_consecutive_range(), but makes it easier to only clear part of the range and will also give ranges for zero'd bits. split_buffer_store() is a helper for splitting VMEM/SMEM stores. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4639>
This commit is contained in:
parent
211a9f2057
commit
562353e1f1
1 changed files with 155 additions and 0 deletions
|
|
@ -3698,6 +3698,108 @@ void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
|
||||||
|
{
|
||||||
|
if (!count)
|
||||||
|
return;
|
||||||
|
|
||||||
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
|
||||||
|
ASSERTED bool is_subdword = false;
|
||||||
|
for (unsigned i = 0; i < count; i++)
|
||||||
|
is_subdword |= offsets[i] % 4;
|
||||||
|
is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
|
||||||
|
assert(!is_subdword || dst_type == RegType::vgpr);
|
||||||
|
|
||||||
|
/* count == 1 fast path */
|
||||||
|
if (count == 1) {
|
||||||
|
if (dst_type == RegType::sgpr)
|
||||||
|
dst[0] = bld.as_uniform(src);
|
||||||
|
else
|
||||||
|
dst[0] = as_vgpr(ctx, src);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < count - 1; i++)
|
||||||
|
dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
|
||||||
|
dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
|
||||||
|
|
||||||
|
if (is_subdword && src.type() == RegType::sgpr) {
|
||||||
|
src = as_vgpr(ctx, src);
|
||||||
|
} else {
|
||||||
|
/* use allocated_vec if possible */
|
||||||
|
auto it = ctx->allocated_vec.find(src.id());
|
||||||
|
if (it != ctx->allocated_vec.end()) {
|
||||||
|
unsigned total_size = 0;
|
||||||
|
for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
|
||||||
|
total_size += it->second[i].bytes();
|
||||||
|
if (total_size != src.bytes())
|
||||||
|
goto split;
|
||||||
|
|
||||||
|
unsigned elem_size = it->second[0].bytes();
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < count; i++) {
|
||||||
|
if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
|
||||||
|
goto split;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < count; i++) {
|
||||||
|
unsigned start_idx = offsets[i] / elem_size;
|
||||||
|
unsigned op_count = dst[i].bytes() / elem_size;
|
||||||
|
if (op_count == 1) {
|
||||||
|
if (dst_type == RegType::sgpr)
|
||||||
|
dst[i] = bld.as_uniform(it->second[start_idx]);
|
||||||
|
else
|
||||||
|
dst[i] = as_vgpr(ctx, it->second[start_idx]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
|
||||||
|
for (unsigned j = 0; j < op_count; j++) {
|
||||||
|
Temp tmp = it->second[start_idx + j];
|
||||||
|
if (dst_type == RegType::sgpr)
|
||||||
|
tmp = bld.as_uniform(tmp);
|
||||||
|
vec->operands[j] = Operand(tmp);
|
||||||
|
}
|
||||||
|
vec->definitions[0] = Definition(dst[i]);
|
||||||
|
bld.insert(std::move(vec));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dst_type == RegType::sgpr)
|
||||||
|
src = bld.as_uniform(src);
|
||||||
|
|
||||||
|
split:
|
||||||
|
/* just split it */
|
||||||
|
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
|
||||||
|
split->operands[0] = Operand(src);
|
||||||
|
for (unsigned i = 0; i < count; i++)
|
||||||
|
split->definitions[i] = Definition(dst[i]);
|
||||||
|
bld.insert(std::move(split));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
|
||||||
|
int *start, int *count)
|
||||||
|
{
|
||||||
|
unsigned start_elem = ffs(todo_mask) - 1;
|
||||||
|
bool skip = !(mask & (1 << start_elem));
|
||||||
|
if (skip)
|
||||||
|
mask = ~mask & todo_mask;
|
||||||
|
|
||||||
|
mask &= todo_mask;
|
||||||
|
|
||||||
|
u_bit_scan_consecutive_range(&mask, start, count);
|
||||||
|
|
||||||
|
return !skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
void advance_write_mask(uint32_t *todo_mask, int start, int count)
|
||||||
|
{
|
||||||
|
*todo_mask &= ~u_bit_consecutive(0, count) << start;
|
||||||
|
}
|
||||||
|
|
||||||
void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
|
void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
|
||||||
Temp address, unsigned base_offset, unsigned align)
|
Temp address, unsigned base_offset, unsigned align)
|
||||||
{
|
{
|
||||||
|
|
@ -3755,6 +3857,59 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
|
||||||
|
Temp data, unsigned writemask, int swizzle_element_size,
|
||||||
|
unsigned *write_count, Temp *write_datas, unsigned *offsets)
|
||||||
|
{
|
||||||
|
unsigned write_count_with_skips = 0;
|
||||||
|
bool skips[16];
|
||||||
|
|
||||||
|
/* determine how to split the data */
|
||||||
|
unsigned todo = u_bit_consecutive(0, data.bytes());
|
||||||
|
while (todo) {
|
||||||
|
int offset, bytes;
|
||||||
|
skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
|
||||||
|
offsets[write_count_with_skips] = offset;
|
||||||
|
if (skips[write_count_with_skips]) {
|
||||||
|
advance_write_mask(&todo, offset, bytes);
|
||||||
|
write_count_with_skips++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
|
||||||
|
* larger than swizzle_element_size */
|
||||||
|
bytes = MIN2(bytes, swizzle_element_size);
|
||||||
|
if (bytes % 4)
|
||||||
|
bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
|
||||||
|
|
||||||
|
/* SMEM and GFX6 VMEM can't emit 12-byte stores */
|
||||||
|
if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
|
||||||
|
bytes = 8;
|
||||||
|
|
||||||
|
/* dword or larger stores have to be dword-aligned */
|
||||||
|
unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
|
||||||
|
unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0;
|
||||||
|
bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
|
||||||
|
if (bytes >= 4 && !dword_aligned)
|
||||||
|
bytes = MIN2(bytes, 2);
|
||||||
|
|
||||||
|
advance_write_mask(&todo, offset, bytes);
|
||||||
|
write_count_with_skips++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* actually split data */
|
||||||
|
split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
|
||||||
|
|
||||||
|
/* remove skips */
|
||||||
|
for (unsigned i = 0; i < write_count_with_skips; i++) {
|
||||||
|
if (skips[i])
|
||||||
|
continue;
|
||||||
|
write_datas[*write_count] = write_datas[i];
|
||||||
|
offsets[*write_count] = offsets[i];
|
||||||
|
(*write_count)++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
|
Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
|
||||||
unsigned split_cnt = 0u, Temp dst = Temp())
|
unsigned split_cnt = 0u, Temp dst = Temp())
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue