mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 02:28:10 +02:00
aco: don't use SMEM for SSBO stores
fossil-db (Navi): Totals from 70 (0.05% of 138791) affected shaders: SGPRs: 2324 -> 2097 (-9.77%) VGPRs: 1344 -> 1480 (+10.12%) CodeSize: 157872 -> 154836 (-1.92%); split: -1.93%, +0.01% MaxWaves: 1288 -> 1260 (-2.17%) Instrs: 29730 -> 29108 (-2.09%); split: -2.13%, +0.04% Cycles: 394944 -> 391280 (-0.93%); split: -0.94%, +0.01% VMEM: 5288 -> 5695 (+7.70%); split: +11.97%, -4.27% SMEM: 2680 -> 2444 (-8.81%); split: +1.34%, -10.15% VClause: 291 -> 502 (+72.51%) SClause: 1176 -> 918 (-21.94%) Copies: 3549 -> 3517 (-0.90%); split: -1.80%, +0.90% Branches: 1230 -> 1228 (-0.16%) PreSGPRs: 1675 -> 1491 (-10.99%) PreVGPRs: 1101 -> 1223 (+11.08%) Totals from 70 (0.05% of 139517) affected shaders (RAVEN): SGPRs: 2368 -> 2121 (-10.43%) VGPRs: 1344 -> 1480 (+10.12%) CodeSize: 156664 -> 153252 (-2.18%) MaxWaves: 636 -> 622 (-2.20%) Instrs: 29968 -> 29226 (-2.48%) Cycles: 398284 -> 393492 (-1.20%) VMEM: 5544 -> 5930 (+6.96%); split: +11.72%, -4.76% SMEM: 2752 -> 2502 (-9.08%); split: +1.20%, -10.28% VClause: 292 -> 504 (+72.60%) SClause: 1236 -> 940 (-23.95%) Copies: 3907 -> 3852 (-1.41%); split: -2.20%, +0.79% Branches: 1230 -> 1228 (-0.16%) PreSGPRs: 1671 -> 1487 (-11.01%) PreVGPRs: 1102 -> 1225 (+11.16%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6143>
This commit is contained in:
parent
2410def98f
commit
867323379e
9 changed files with 27 additions and 143 deletions
|
|
@ -146,7 +146,7 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
|
|||
FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
|
||||
return flat->disable_wqm;
|
||||
} else {
|
||||
return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
|
||||
return instr->format == Format::EXP;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -648,43 +648,6 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
return i;
|
||||
}
|
||||
|
||||
void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
|
||||
{
|
||||
Operand offset = instr->operands[1];
|
||||
if (need_check) {
|
||||
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
|
||||
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
|
||||
|
||||
if (offset.isLiteral())
|
||||
offset = bld.copy(bld.def(s1), offset);
|
||||
|
||||
offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
|
||||
offset, Operand(UINT32_MAX), bld.scc(nonempty));
|
||||
} else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
|
||||
offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
|
||||
}
|
||||
if (!offset.isConstant())
|
||||
offset.setFixed(m0);
|
||||
|
||||
switch (instr->operands[2].size()) {
|
||||
case 1:
|
||||
instr->opcode = aco_opcode::s_buffer_store_dword;
|
||||
break;
|
||||
case 2:
|
||||
instr->opcode = aco_opcode::s_buffer_store_dwordx2;
|
||||
break;
|
||||
case 4:
|
||||
instr->opcode = aco_opcode::s_buffer_store_dwordx4;
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid SMEM buffer store size");
|
||||
}
|
||||
instr->operands[1] = offset;
|
||||
/* as_uniform() needs to be done here so it's done in exact mode and helper
|
||||
* lanes don't contribute. */
|
||||
instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
|
||||
}
|
||||
|
||||
void process_instructions(exec_ctx& ctx, Block* block,
|
||||
std::vector<aco_ptr<Instruction>>& instructions,
|
||||
unsigned idx)
|
||||
|
|
@ -831,10 +794,6 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
instr->operands[0] = bld.scc(exit_cond);
|
||||
state = Exact;
|
||||
|
||||
} else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
|
||||
bool need_check = ctx.info[block->index].exec.size() != 1 &&
|
||||
!(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
|
||||
lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
|
||||
}
|
||||
|
||||
bld.insert(std::move(instr));
|
||||
|
|
|
|||
|
|
@ -3801,24 +3801,21 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
|
|||
}
|
||||
|
||||
|
||||
aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
|
||||
aco_opcode get_buffer_store_op(unsigned bytes)
|
||||
{
|
||||
switch (bytes) {
|
||||
case 1:
|
||||
assert(!smem);
|
||||
return aco_opcode::buffer_store_byte;
|
||||
case 2:
|
||||
assert(!smem);
|
||||
return aco_opcode::buffer_store_short;
|
||||
case 4:
|
||||
return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
|
||||
return aco_opcode::buffer_store_dword;
|
||||
case 8:
|
||||
return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
|
||||
return aco_opcode::buffer_store_dwordx2;
|
||||
case 12:
|
||||
assert(!smem);
|
||||
return aco_opcode::buffer_store_dwordx3;
|
||||
case 16:
|
||||
return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
|
||||
return aco_opcode::buffer_store_dwordx4;
|
||||
}
|
||||
unreachable("Unexpected store size");
|
||||
return aco_opcode::num_opcodes;
|
||||
|
|
@ -3943,7 +3940,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
|
|||
assert(vdata.size() >= 1 && vdata.size() <= 4);
|
||||
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
aco_opcode op = get_buffer_store_op(false, vdata.bytes());
|
||||
aco_opcode op = get_buffer_store_op(vdata.bytes());
|
||||
const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
|
||||
|
||||
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
|
||||
|
|
@ -6198,70 +6195,29 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
|
||||
memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
|
||||
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
|
||||
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
|
||||
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
|
||||
* TODO: this optimization is disabled for now because we still need to ensure correct ordering
|
||||
*/
|
||||
bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
|
||||
|
||||
bool smem = !nir_src_is_divergent(instr->src[2]) &&
|
||||
ctx->options->chip_class >= GFX8 &&
|
||||
ctx->options->chip_class < GFX10_3 &&
|
||||
(elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
|
||||
allow_smem;
|
||||
if (smem)
|
||||
offset = bld.as_uniform(offset);
|
||||
bool smem_nonfs = smem && ctx->stage != fragment_fs;
|
||||
|
||||
unsigned write_count = 0;
|
||||
Temp write_datas[32];
|
||||
unsigned offsets[32];
|
||||
split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
|
||||
split_buffer_store(ctx, instr, false, RegType::vgpr,
|
||||
data, writemask, 16, &write_count, write_datas, offsets);
|
||||
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
|
||||
if (smem && ctx->stage == fragment_fs)
|
||||
op = aco_opcode::p_fs_buffer_store_smem;
|
||||
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
|
||||
|
||||
if (smem) {
|
||||
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
|
||||
store->operands[0] = Operand(rsrc);
|
||||
if (offsets[i]) {
|
||||
Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
|
||||
offset, Operand(offsets[i]));
|
||||
store->operands[1] = Operand(off);
|
||||
} else {
|
||||
store->operands[1] = Operand(offset);
|
||||
}
|
||||
if (op != aco_opcode::p_fs_buffer_store_smem)
|
||||
store->operands[1].setFixed(m0);
|
||||
store->operands[2] = Operand(write_datas[i]);
|
||||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->sync = sync;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
ctx->program->wb_smem_l1_on_end = true;
|
||||
if (op == aco_opcode::p_fs_buffer_store_smem) {
|
||||
ctx->block->kind |= block_kind_needs_lowering;
|
||||
ctx->program->needs_exact = true;
|
||||
}
|
||||
} else {
|
||||
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
|
||||
store->operands[0] = Operand(rsrc);
|
||||
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
|
||||
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
|
||||
store->operands[3] = Operand(write_datas[i]);
|
||||
store->offset = offsets[i];
|
||||
store->offen = (offset.type() == RegType::vgpr);
|
||||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
|
||||
store->operands[0] = Operand(rsrc);
|
||||
store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
|
||||
store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
|
||||
store->operands[3] = Operand(write_datas[i]);
|
||||
store->offset = offsets[i];
|
||||
store->offen = (offset.type() == RegType::vgpr);
|
||||
store->glc = glc;
|
||||
store->dlc = false;
|
||||
store->disable_wqm = true;
|
||||
store->sync = sync;
|
||||
ctx->program->needs_exact = true;
|
||||
ctx->block->instructions.emplace_back(std::move(store));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -6469,7 +6425,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
} else {
|
||||
assert(ctx->options->chip_class == GFX6);
|
||||
|
||||
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
|
||||
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
|
||||
|
||||
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
|
||||
|
||||
|
|
@ -6906,7 +6862,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
|
|||
swizzle_component_size, &write_count, write_datas, offsets);
|
||||
|
||||
for (unsigned i = 0; i < write_count; i++) {
|
||||
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
|
||||
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
|
||||
Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
|
||||
static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
|
||||
}
|
||||
|
|
@ -11863,8 +11819,6 @@ void select_program(Program *program,
|
|||
append_logical_end(ctx.block);
|
||||
ctx.block->kind |= block_kind_uniform;
|
||||
Builder bld(ctx.program, ctx.block);
|
||||
if (ctx.program->wb_smem_l1_on_end)
|
||||
bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
|
||||
cleanup_cfg(program);
|
||||
|
|
|
|||
|
|
@ -46,8 +46,6 @@ enum resource_flags {
|
|||
has_nonglc_vmem_store = 0x8,
|
||||
|
||||
has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
|
||||
has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
|
||||
has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
|
||||
|
||||
buffer_is_restrict = 0x10,
|
||||
};
|
||||
|
|
@ -200,20 +198,6 @@ inline uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def
|
|||
return res;
|
||||
}
|
||||
|
||||
inline bool can_subdword_ssbo_store_use_smem(nir_intrinsic_instr *intrin)
|
||||
{
|
||||
unsigned wrmask = nir_intrinsic_write_mask(intrin);
|
||||
if (util_last_bit(wrmask) != util_bitcount(wrmask) ||
|
||||
util_bitcount(wrmask) * intrin->src[0].ssa->bit_size % 32 ||
|
||||
util_bitcount(wrmask) != intrin->src[0].ssa->num_components)
|
||||
return false;
|
||||
|
||||
if (nir_intrinsic_align_mul(intrin) % 4 || nir_intrinsic_align_offset(intrin) % 4)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void init_context(isel_context *ctx, nir_shader *shader);
|
||||
void cleanup_context(isel_context *ctx);
|
||||
|
||||
|
|
|
|||
|
|
@ -189,10 +189,7 @@ void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
|
|||
res = intrin->src[0].ssa;
|
||||
break;
|
||||
case nir_intrinsic_store_ssbo:
|
||||
if (nir_src_is_divergent(intrin->src[2]) ||
|
||||
ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
|
||||
(intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
|
||||
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
|
||||
flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
|
||||
res = intrin->src[1].ssa;
|
||||
break;
|
||||
case nir_intrinsic_load_global:
|
||||
|
|
|
|||
|
|
@ -1640,7 +1640,6 @@ public:
|
|||
Stage stage;
|
||||
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
|
||||
bool needs_wqm = false; /* there exists a p_wqm instruction */
|
||||
bool wb_smem_l1_on_end = false;
|
||||
|
||||
std::vector<uint8_t> constant_data;
|
||||
Temp private_segment_buffer;
|
||||
|
|
|
|||
|
|
@ -1875,8 +1875,6 @@ void lower_to_hw_instr(Program* program)
|
|||
bld.reset(discard_block);
|
||||
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
|
||||
0, V_008DFC_SQ_EXP_NULL, false, true, true);
|
||||
if (program->wb_smem_l1_on_end)
|
||||
bld.smem(aco_opcode::s_dcache_wb);
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
|
||||
bld.reset(&ctx.instructions);
|
||||
|
|
|
|||
|
|
@ -285,8 +285,6 @@ opcode("p_demote_to_helper")
|
|||
opcode("p_is_helper")
|
||||
opcode("p_exit_early_if")
|
||||
|
||||
opcode("p_fs_buffer_store_smem", format=Format.SMEM)
|
||||
|
||||
# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
|
||||
opcode("p_bpermute")
|
||||
|
||||
|
|
|
|||
|
|
@ -136,10 +136,7 @@ void finish_program(Program *program)
|
|||
for (Block& block : program->blocks) {
|
||||
if (block.linear_succs.size() == 0) {
|
||||
block.kind |= block_kind_uniform;
|
||||
Builder bld(program, &block);
|
||||
if (program->wb_smem_l1_on_end)
|
||||
bld.smem(aco_opcode::s_dcache_wb, false);
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
Builder(program, &block).sopp(aco_opcode::s_endpgm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,10 +67,8 @@ BEGIN_TEST(isel.compute.simple)
|
|||
uint res;
|
||||
};
|
||||
void main() {
|
||||
//~gfx7>> v1: %data = p_parallelcopy 42
|
||||
//~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
|
||||
//~gfx8>> s1: %data = p_parallelcopy 42
|
||||
//~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
|
||||
//>> v1: %data = p_parallelcopy 42
|
||||
//buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
|
||||
res = 42;
|
||||
}
|
||||
);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue