aco: Implement load_typed_buffer_amd.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16805>
This commit is contained in:
Timur Kristóf 2023-02-03 01:03:22 +01:00 committed by Marge Bot
parent 022e55557b
commit 477cb943f6
2 changed files with 147 additions and 13 deletions

View file

@ -4052,9 +4052,11 @@ struct LoadEmitInfo {
unsigned const_offset = 0;
unsigned align_mul = 0;
unsigned align_offset = 0;
pipe_format format;
bool glc = false;
bool slc = false;
bool split_by_component_stride = true;
unsigned swizzle_component_size = 0;
memory_sync_info sync;
Temp soffset = Temp(0, s1);
@ -4112,10 +4114,12 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
}
}
if (info.swizzle_component_size)
bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
if (info.component_stride)
bytes_needed = MIN2(bytes_needed, info.component_size);
if (info.split_by_component_stride) {
if (info.swizzle_component_size)
bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
if (info.component_stride)
bytes_needed = MIN2(bytes_needed, info.component_size);
}
bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
@ -4222,9 +4226,11 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
/* add result to list and advance */
if (info.component_stride) {
assert(val.bytes() == info.component_size && "unimplemented");
const_offset += info.component_stride;
align_offset = (align_offset + info.component_stride) % align_mul;
assert(val.bytes() % info.component_size == 0);
unsigned num_loaded_components = val.bytes() / info.component_size;
unsigned advance_bytes = info.component_stride * num_loaded_components;
const_offset += advance_bytes;
align_offset = (align_offset + advance_bytes) % align_mul;
} else {
const_offset += val.bytes();
align_offset = (align_offset + val.bytes()) % align_mul;
@ -5519,6 +5525,106 @@ visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
}
}
/* Emit a single typed-buffer (MTBUF) load of at most bytes_needed bytes.
 *
 * Used as the emit_load() callback for nir_intrinsic_load_typed_buffer_amd.
 * Returns the loaded value as a VGPR temp (dst_hint is used when its register
 * class matches); emit_load() stitches multiple partial loads together.
 */
Temp
mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                    unsigned alignment, unsigned const_offset, Temp dst_hint)
{
   Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
   Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);

   /* If a dynamic SGPR offset is also supplied, the soffset slot is taken by it,
    * so a scalar "offset" has to be copied into a VGPR and go through vaddr.
    */
   if (info.soffset.id()) {
      if (soffset.isTemp())
         vaddr = bld.copy(bld.def(v1), soffset);
      soffset = Operand(info.soffset);
   }

   const bool offen = !vaddr.isUndefined();
   const bool idxen = info.idx.id();

   /* With both index and offset enabled, vaddr is a 64-bit {index, offset} pair. */
   if (offen && idxen)
      vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
   else if (idxen)
      vaddr = Operand(info.idx);

   /* Determine number of fetched components.
    * Note, ACO IR works with GFX6-8 nfmt + dfmt fields, these are later converted for GFX10+.
    */
   const struct ac_vtx_format_info* vtx_info =
      ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
   /* The number of channels in the format determines the memory range. */
   const unsigned max_components = vtx_info->num_channels;

   /* Calculate maximum number of components loaded according to alignment. */
   unsigned max_fetched_components = bytes_needed / info.component_size;
   max_fetched_components =
      ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
                             alignment, max_fetched_components);
   const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
   /* Adjust bytes needed in case we need to do a smaller load due to alignment.
    * If a larger format is selected, it's still OK to load a smaller amount from it.
    */
   bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);

   unsigned bytes_size = 0;
   const unsigned bit_size = info.component_size * 8;
   aco_opcode op = aco_opcode::num_opcodes;

   if (bytes_needed == 2) {
      bytes_size = 2;
      op = aco_opcode::tbuffer_load_format_d16_x;
   } else if (bytes_needed <= 4) {
      bytes_size = 4;
      if (bit_size == 16)
         op = aco_opcode::tbuffer_load_format_d16_xy;
      else
         op = aco_opcode::tbuffer_load_format_x;
   } else if (bytes_needed <= 6) {
      bytes_size = 6;
      /* 5-6 bytes is only reachable with 16-bit components (other component
       * sizes yield bytes_needed that is a multiple of 4, or are clamped to at
       * most 4 channels above). Leave op unset otherwise so the abort below
       * catches it instead of a two-dword format clobbering more than
       * bytes_size bytes of the definition.
       */
      if (bit_size == 16)
         op = aco_opcode::tbuffer_load_format_d16_xyz;
   } else if (bytes_needed <= 8) {
      bytes_size = 8;
      if (bit_size == 16)
         op = aco_opcode::tbuffer_load_format_d16_xyzw;
      else
         op = aco_opcode::tbuffer_load_format_xy;
   } else if (bytes_needed <= 12) {
      bytes_size = 12;
      /* D16 formats top out at 4 components (8 bytes), so loads larger than
       * 8 bytes must use 32-bit components; guard it so an unsupported bit
       * size reaches the abort below instead of silently picking a 32-bit
       * format that would misinterpret the data.
       */
      if (bit_size == 32)
         op = aco_opcode::tbuffer_load_format_xyz;
   } else {
      bytes_size = 16;
      if (bit_size == 32)
         op = aco_opcode::tbuffer_load_format_xyzw;
   }

   /* Abort when suitable opcode wasn't found so we don't compile buggy shaders. */
   if (op == aco_opcode::num_opcodes) {
      aco_err(bld.program, "unsupported bit size for typed buffer load");
      abort();
   }

   aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
   mtbuf->operands[0] = Operand(info.resource);
   mtbuf->operands[1] = vaddr;
   mtbuf->operands[2] = soffset;
   mtbuf->offen = offen;
   mtbuf->idxen = idxen;
   mtbuf->glc = info.glc;
   /* DLC mirrors GLC, but only exists on GFX10/GFX10.3. */
   mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
   mtbuf->slc = info.slc;
   mtbuf->sync = info.sync;
   mtbuf->offset = const_offset;
   /* ACO IR stores the GFX6-8 style format: low 4 bits dfmt, remaining bits nfmt. */
   mtbuf->dfmt = fetch_fmt & 0xf;
   mtbuf->nfmt = fetch_fmt >> 4;

   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   mtbuf->definitions[0] = Definition(val);
   bld.insert(std::move(mtbuf));
   return val;
}
const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
void
visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
{
@ -7203,24 +7309,50 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
unsigned const_offset = nir_intrinsic_base(intrin);
unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
unsigned num_components = intrin->dest.ssa.num_components;
unsigned swizzle_element_size = swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
info.idx = idx;
info.component_stride = swizzle_element_size;
info.glc = glc;
info.slc = slc;
info.swizzle_component_size = swizzle_element_size ? 4 : 0;
info.align_mul = MIN2(elem_size_bytes, 4);
info.align_offset = 0;
info.soffset = s_offset;
info.const_offset = const_offset;
info.sync = sync;
emit_load(ctx, bld, info, mubuf_load_params);
if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
const pipe_format format = nir_intrinsic_format(intrin);
const struct ac_vtx_format_info* vtx_info =
ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
const struct util_format_description* f = util_format_description(format);
const unsigned align_mul = nir_intrinsic_align_mul(intrin);
const unsigned align_offset = nir_intrinsic_align_offset(intrin);
/* Avoid splitting:
* - non-array formats because that would result in incorrect code
* - when element size is same as component size (to reduce instruction count)
*/
const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
info.align_mul = align_mul;
info.align_offset = align_offset;
info.format = format;
info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
info.split_by_component_stride = false;
emit_load(ctx, bld, info, mtbuf_load_params);
} else {
const unsigned swizzle_element_size =
swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
info.component_stride = swizzle_element_size;
info.swizzle_component_size = swizzle_element_size ? 4 : 0;
info.align_mul = MIN2(elem_size_bytes, 4);
info.align_offset = 0;
emit_load(ctx, bld, info, mubuf_load_params);
}
}
void
@ -8276,6 +8408,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_bindless_image_atomic_fmax: visit_image_atomic(ctx, instr); break;
case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
case nir_intrinsic_load_typed_buffer_amd:
case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;

View file

@ -676,6 +676,7 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_invocation_id:
case nir_intrinsic_load_primitive_id:
case nir_intrinsic_load_typed_buffer_amd:
case nir_intrinsic_load_buffer_amd:
case nir_intrinsic_load_initial_edgeflags_amd:
case nir_intrinsic_gds_atomic_add_amd: