aco: use scratch_* for scratch load/store on GFX9+

fossil-db (navi21):
Totals from 52 (0.03% of 162293) affected shaders:
Instrs: 83190 -> 82145 (-1.26%)
CodeSize: 454892 -> 447260 (-1.68%); split: -1.68%, +0.00%
VGPRs: 4768 -> 4672 (-2.01%)
Latency: 1490887 -> 1487170 (-0.25%); split: -0.68%, +0.43%
InvThroughput: 935500 -> 933060 (-0.26%); split: -0.72%, +0.46%
VClause: 2715 -> 2632 (-3.06%); split: -4.53%, +1.47%
SClause: 1902 -> 1883 (-1.00%)
Copies: 8839 -> 8496 (-3.88%)
PreSGPRs: 2012 -> 1807 (-10.19%)
PreVGPRs: 3282 -> 3192 (-2.74%)

fossil-db (vega10):
Totals from 41 (0.03% of 161355) affected shaders:
Instrs: 35772 -> 35699 (-0.20%)
CodeSize: 187040 -> 186584 (-0.24%)
VGPRs: 4044 -> 4072 (+0.69%)
Latency: 243088 -> 242379 (-0.29%)
InvThroughput: 180301 -> 179783 (-0.29%)
VClause: 1204 -> 1216 (+1.00%)
SClause: 653 -> 637 (-2.45%)
Copies: 3736 -> 3704 (-0.86%); split: -0.88%, +0.03%
PreSGPRs: 1331 -> 1207 (-9.32%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17079>
This commit is contained in:
Rhys Perry 2022-05-19 14:12:08 +01:00 committed by Marge Bot
parent d2d94b62f2
commit 0e783d687a

View file

@ -4421,7 +4421,47 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
}
const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
Temp
scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
unsigned align_, unsigned const_offset, Temp dst_hint)
{
unsigned bytes_size = 0;
aco_opcode op;
if (bytes_needed == 1 || align_ % 2u) {
bytes_size = 1;
op = aco_opcode::scratch_load_ubyte;
} else if (bytes_needed == 2 || align_ % 4u) {
bytes_size = 2;
op = aco_opcode::scratch_load_ushort;
} else if (bytes_needed <= 4) {
bytes_size = 4;
op = aco_opcode::scratch_load_dword;
} else if (bytes_needed <= 8) {
bytes_size = 8;
op = aco_opcode::scratch_load_dwordx2;
} else if (bytes_needed <= 12) {
bytes_size = 12;
op = aco_opcode::scratch_load_dwordx3;
} else {
bytes_size = 16;
op = aco_opcode::scratch_load_dwordx4;
}
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
flat->sync = info.sync;
flat->offset = const_offset;
flat->definitions[0] = Definition(val);
bld.insert(std::move(flat));
return val;
}
const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
Temp
get_gfx6_global_rsrc(Builder& bld, Temp addr)
@ -7498,27 +7538,40 @@ void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
Temp rsrc = get_scratch_resource(ctx);
Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
instr->dest.ssa.bit_size / 8u, rsrc};
LoadEmitInfo info = {Operand(v1), dst, instr->dest.ssa.num_components,
instr->dest.ssa.bit_size / 8u};
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
info.sync = memory_sync_info(storage_scratch, semantic_private);
info.soffset = ctx->program->scratch_offset;
emit_load(ctx, bld, info, scratch_load_params);
if (ctx->program->gfx_level >= GFX9) {
if (nir_src_is_const(instr->src[0])) {
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
info.offset =
bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
info.const_offset = nir_src_as_uint(instr->src[0]) % max;
} else {
info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
}
EmitLoadParameters params = scratch_flat_load_params;
params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
emit_load(ctx, bld, info, params);
} else {
info.resource = get_scratch_resource(ctx);
info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
info.soffset = ctx->program->scratch_offset;
emit_load(ctx, bld, info, scratch_mubuf_load_params);
}
}
void
visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
Temp rsrc = get_scratch_resource(ctx);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
@ -7530,11 +7583,44 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
&write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
offsets[i], true, true);
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
if (ctx->program->gfx_level >= GFX9) {
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
uint32_t base_const_offset =
nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op;
switch (write_datas[i].bytes()) {
case 1: op = aco_opcode::scratch_store_byte; break;
case 2: op = aco_opcode::scratch_store_short; break;
case 4: op = aco_opcode::scratch_store_dword; break;
case 8: op = aco_opcode::scratch_store_dwordx2; break;
case 12: op = aco_opcode::scratch_store_dwordx3; break;
case 16: op = aco_opcode::scratch_store_dwordx4; break;
default: unreachable("Unexpected store size");
}
uint32_t const_offset = base_const_offset + offsets[i];
assert(const_offset < max || offset.id() == 0);
Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
if (offset.id() == 0)
saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
memory_sync_info(storage_scratch, semantic_private));
}
} else {
Temp rsrc = get_scratch_resource(ctx);
offset = as_vgpr(ctx, offset);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
write_datas[i], offsets[i], true, true);
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
}
}
}