aco: don't use SMEM for SSBO stores

fossil-db (Navi):
Totals from 70 (0.05% of 138791) affected shaders:
SGPRs: 2324 -> 2097 (-9.77%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 157872 -> 154836 (-1.92%); split: -1.93%, +0.01%
MaxWaves: 1288 -> 1260 (-2.17%)
Instrs: 29730 -> 29108 (-2.09%); split: -2.13%, +0.04%
Cycles: 394944 -> 391280 (-0.93%); split: -0.94%, +0.01%
VMEM: 5288 -> 5695 (+7.70%); split: +11.97%, -4.27%
SMEM: 2680 -> 2444 (-8.81%); split: +1.34%, -10.15%
VClause: 291 -> 502 (+72.51%)
SClause: 1176 -> 918 (-21.94%)
Copies: 3549 -> 3517 (-0.90%); split: -1.80%, +0.90%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1675 -> 1491 (-10.99%)
PreVGPRs: 1101 -> 1223 (+11.08%)

Totals from 70 (0.05% of 139517) affected shaders (RAVEN):
SGPRs: 2368 -> 2121 (-10.43%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 156664 -> 153252 (-2.18%)
MaxWaves: 636 -> 622 (-2.20%)
Instrs: 29968 -> 29226 (-2.48%)
Cycles: 398284 -> 393492 (-1.20%)
VMEM: 5544 -> 5930 (+6.96%); split: +11.72%, -4.76%
SMEM: 2752 -> 2502 (-9.08%); split: +1.20%, -10.28%
VClause: 292 -> 504 (+72.60%)
SClause: 1236 -> 940 (-23.95%)
Copies: 3907 -> 3852 (-1.41%); split: -2.20%, +0.79%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1671 -> 1487 (-11.01%)
PreVGPRs: 1102 -> 1225 (+11.16%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6143>
This commit is contained in:
Rhys Perry 2020-07-30 19:44:28 +01:00 committed by Marge Bot
parent 2410def98f
commit 867323379e
9 changed files with 27 additions and 143 deletions

View file

@@ -146,7 +146,7 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
return flat->disable_wqm;
} else {
return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
return instr->format == Format::EXP;
}
}
@@ -648,43 +648,6 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
return i;
}
/* Lowers the p_fs_buffer_store_smem pseudo-instruction into a real SMEM
 * s_buffer_store_dword{,x2,x4}. This runs after exec-mask handling so the
 * store is emitted in exact mode.
 *
 * If need_check is set, the previous exec mask might be empty; in that case
 * the store must become a no-op, which is done by selecting UINT32_MAX as the
 * buffer offset (see the inline comment below). */
void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
{
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
/* a literal can't be used directly as an s_cselect_b32 operand, so copy it
 * into an SGPR first */
if (offset.isLiteral())
offset = bld.copy(bld.def(s1), offset);
offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
offset, Operand(UINT32_MAX), bld.scc(nonempty));
} else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
/* constants above 0xFFFFF are materialized in an SGPR (hinted to m0) —
 * presumably they don't fit the SMEM immediate offset field; TODO confirm
 * against the ISA docs */
offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
}
/* non-constant offsets are supplied to the SMEM instruction through m0 */
if (!offset.isConstant())
offset.setFixed(m0);
/* pick the real opcode from the store data size in dwords */
switch (instr->operands[2].size()) {
case 1:
instr->opcode = aco_opcode::s_buffer_store_dword;
break;
case 2:
instr->opcode = aco_opcode::s_buffer_store_dwordx2;
break;
case 4:
instr->opcode = aco_opcode::s_buffer_store_dwordx4;
break;
default:
unreachable("Invalid SMEM buffer store size");
}
instr->operands[1] = offset;
/* as_uniform() needs to be done here so it's done in exact mode and helper
 * lanes don't contribute. */
instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
}
void process_instructions(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
@@ -831,10 +794,6 @@ void process_instructions(exec_ctx& ctx, Block* block,
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
} else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
bool need_check = ctx.info[block->index].exec.size() != 1 &&
!(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
}
bld.insert(std::move(instr));

View file

@@ -3801,24 +3801,21 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
}
aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
aco_opcode get_buffer_store_op(unsigned bytes)
{
switch (bytes) {
case 1:
assert(!smem);
return aco_opcode::buffer_store_byte;
case 2:
assert(!smem);
return aco_opcode::buffer_store_short;
case 4:
return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
return aco_opcode::buffer_store_dword;
case 8:
return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
return aco_opcode::buffer_store_dwordx2;
case 12:
assert(!smem);
return aco_opcode::buffer_store_dwordx3;
case 16:
return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
return aco_opcode::buffer_store_dwordx4;
}
unreachable("Unexpected store size");
return aco_opcode::num_opcodes;
@@ -3943,7 +3940,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
assert(vdata.size() >= 1 && vdata.size() <= 4);
Builder bld(ctx->program, ctx->block);
aco_opcode op = get_buffer_store_op(false, vdata.bytes());
aco_opcode op = get_buffer_store_op(vdata.bytes());
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -6198,70 +6195,29 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
* TODO: this optimization is disabled for now because we still need to ensure correct ordering
*/
bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
ctx->options->chip_class < GFX10_3 &&
(elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
allow_smem;
if (smem)
offset = bld.as_uniform(offset);
bool smem_nonfs = smem && ctx->stage != fragment_fs;
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
split_buffer_store(ctx, instr, false, RegType::vgpr,
data, writemask, 16, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
if (smem && ctx->stage == fragment_fs)
op = aco_opcode::p_fs_buffer_store_smem;
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
if (smem) {
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
if (offsets[i]) {
Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
}
if (op != aco_opcode::p_fs_buffer_store_smem)
store->operands[1].setFixed(m0);
store->operands[2] = Operand(write_datas[i]);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
ctx->block->kind |= block_kind_needs_lowering;
ctx->program->needs_exact = true;
}
} else {
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
store->operands[0] = Operand(rsrc);
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
store->operands[0] = Operand(rsrc);
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
store->operands[3] = Operand(write_datas[i]);
store->offset = offsets[i];
store->offen = (offset.type() == RegType::vgpr);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
@@ -6469,7 +6425,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
} else {
assert(ctx->options->chip_class == GFX6);
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
@@ -6906,7 +6862,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
swizzle_component_size, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
}
@@ -11863,8 +11819,6 @@ void select_program(Program *program,
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);

View file

@@ -46,8 +46,6 @@ enum resource_flags {
has_nonglc_vmem_store = 0x8,
has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
buffer_is_restrict = 0x10,
};
@@ -200,20 +198,6 @@ inline uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def
return res;
}
/* Returns whether a sub-dword (8/16-bit component) SSBO store can be
 * implemented with SMEM. This requires that:
 *  - the write mask is a contiguous run of components starting at component 0
 *    (util_last_bit == util_bitcount) covering every component of the source,
 *  - the total number of bits written is a multiple of 32, and
 *  - the store is 4-byte aligned (both align_mul and align_offset). */
inline bool can_subdword_ssbo_store_use_smem(nir_intrinsic_instr *intrin)
{
unsigned wrmask = nir_intrinsic_write_mask(intrin);
if (util_last_bit(wrmask) != util_bitcount(wrmask) ||
util_bitcount(wrmask) * intrin->src[0].ssa->bit_size % 32 ||
util_bitcount(wrmask) != intrin->src[0].ssa->num_components)
return false;
if (nir_intrinsic_align_mul(intrin) % 4 || nir_intrinsic_align_offset(intrin) % 4)
return false;
return true;
}
void init_context(isel_context *ctx, nir_shader *shader);
void cleanup_context(isel_context *ctx);

View file

@@ -189,10 +189,7 @@ void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
res = intrin->src[0].ssa;
break;
case nir_intrinsic_store_ssbo:
if (nir_src_is_divergent(intrin->src[2]) ||
ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
(intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
res = intrin->src[1].ssa;
break;
case nir_intrinsic_load_global:

View file

@@ -1640,7 +1640,6 @@ public:
Stage stage;
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
bool needs_wqm = false; /* there exists a p_wqm instruction */
bool wb_smem_l1_on_end = false;
std::vector<uint8_t> constant_data;
Temp private_segment_buffer;

View file

@@ -1875,8 +1875,6 @@ void lower_to_hw_instr(Program* program)
bld.reset(discard_block);
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
0, V_008DFC_SQ_EXP_NULL, false, true, true);
if (program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb);
bld.sopp(aco_opcode::s_endpgm);
bld.reset(&ctx.instructions);

View file

@@ -285,8 +285,6 @@ opcode("p_demote_to_helper")
opcode("p_is_helper")
opcode("p_exit_early_if")
opcode("p_fs_buffer_store_smem", format=Format.SMEM)
# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute")

View file

@@ -136,10 +136,7 @@ void finish_program(Program *program)
for (Block& block : program->blocks) {
if (block.linear_succs.size() == 0) {
block.kind |= block_kind_uniform;
Builder bld(program, &block);
if (program->wb_smem_l1_on_end)
bld.smem(aco_opcode::s_dcache_wb, false);
bld.sopp(aco_opcode::s_endpgm);
Builder(program, &block).sopp(aco_opcode::s_endpgm);
}
}
}

View file

@@ -67,10 +67,8 @@ BEGIN_TEST(isel.compute.simple)
uint res;
};
void main() {
//~gfx7>> v1: %data = p_parallelcopy 42
//~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
//~gfx8>> s1: %data = p_parallelcopy 42
//~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
//>> v1: %data = p_parallelcopy 42
//buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
res = 42;
}
);