aco: use ac_hw_cache_flags

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29243>
Authored by Rhys Perry on 2024-05-14 18:34:01 +01:00; committed by Marge Bot
parent cdaf269924
commit b41f0f6cc1
9 changed files with 244 additions and 205 deletions
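
In short: the separate glc/slc/dlc (and MUBUF swizzled) booleans on each memory-instruction format are replaced by one packed ac_hw_cache_flags field, and every user now sets or tests bits of its raw byte. Below is a minimal, self-contained sketch of that pattern. The type and enumerants are stand-ins for the real amd/common definitions (presumably in ac_shader_util.h), and the bit values are illustrative assumptions; the diff only relies on ac_glc, ac_slc, ac_dlc and ac_swizzled being distinct bits:

#include <cstdint>
#include <cstdio>

/* Stand-ins for the real definitions (assumed, not copied from Mesa). The one
 * property the diff relies on: the flags are distinct bits of a single byte,
 * reachable through ".value". */
enum : uint8_t {
   ac_glc = 1u << 0,      /* globally coherent */
   ac_slc = 1u << 1,      /* system-level coherent */
   ac_dlc = 1u << 2,      /* device-level coherent, GFX10+ */
   ac_swizzled = 1u << 3, /* swizzled buffer access */
};

struct ac_hw_cache_flags { uint8_t value; };

int main()
{
   ac_hw_cache_flags cache{};
   cache.value |= ac_glc | ac_dlc;  /* set bits, as instruction selection does */
   bool glc = cache.value & ac_glc; /* test bits, as the assembler does */
   bool swz = cache.value & ac_swizzled;
   std::printf("glc=%d swizzled=%d\n", (int)glc, (int)swz);
   return 0;
}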


@ -126,11 +126,14 @@ template <typename T>
uint32_t
get_gfx12_cpol(const T& instr)
{
bool glc = instr.cache.value & ac_glc;
bool slc = instr.cache.value & ac_slc;
bool dlc = instr.cache.value & ac_dlc;
if (instr_info.is_atomic[(int)instr.opcode]) {
return (instr.glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
return (glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
} else {
return (instr.definitions.empty() || instr.glc || instr.slc || instr.dlc) ? 3 /*SCOPE_SYS*/
: 0 /*SCOPE_CU*/;
return (instr.definitions.empty() || glc || slc || dlc) ? 3 /*SCOPE_SYS*/
: 0 /*SCOPE_CU*/;
}
}
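
For reference, the full mapping the rewritten get_gfx12_cpol() above encodes, worked out case by case (names taken from the inline comments):

/* atomic with ac_glc set            -> 1 << 2  (TH_ATOMIC_RETURN)
 * atomic without ac_glc             -> 0
 * store (definitions empty)         -> 3       (SCOPE_SYS)
 * load with any of glc/slc/dlc set  -> 3       (SCOPE_SYS)
 * load with none of the flags       -> 0       (SCOPE_CU)
 */
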
@ -228,6 +231,8 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
SMEM_instruction& smem = instr->smem();
bool glc = smem.cache.value & ac_glc;
bool dlc = smem.cache.value & ac_dlc;
bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
bool is_load = !instr->definitions.empty();
@ -258,22 +263,21 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
if (ctx.gfx_level <= GFX9) {
encoding = (0b110000 << 26);
assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= smem.nv ? 1 << 15 : 0;
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
/* We don't use the NV bit. */
} else {
encoding = (0b111101 << 26);
assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
if (ctx.gfx_level <= GFX11_5)
encoding |= smem.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
}
if (ctx.gfx_level <= GFX11_5) {
encoding |= opcode << 18;
encoding |= smem.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
} else {
encoding |= opcode << 13;
if (is_load)
encoding |= ((smem.glc || smem.dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
encoding |= ((glc || dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
}
if (ctx.gfx_level <= GFX9) {
@ -536,6 +540,9 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MUBUF_instruction& mubuf = instr->mubuf();
bool glc = mubuf.cache.value & ac_glc;
bool slc = mubuf.cache.value & ac_slc;
bool dlc = mubuf.cache.value & ac_dlc;
uint32_t encoding = (0b111000 << 26);
if (ctx.gfx_level >= GFX11 && mubuf.lds) /* GFX11 has separate opcodes for LDS loads */
@ -543,7 +550,7 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
else
encoding |= (mubuf.lds ? 1 : 0) << 16;
encoding |= opcode << 18;
encoding |= (mubuf.glc ? 1 : 0) << 14;
encoding |= (glc ? 1 : 0) << 14;
if (ctx.gfx_level <= GFX10_3)
encoding |= (mubuf.idxen ? 1 : 0) << 13;
assert(!mubuf.addr64 || ctx.gfx_level <= GFX7);
@ -552,19 +559,19 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
if (ctx.gfx_level <= GFX10_3)
encoding |= (mubuf.offen ? 1 : 0) << 12;
if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) {
assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= (mubuf.slc ? 1 : 0) << 17;
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
encoding |= (slc ? 1 : 0) << 17;
} else if (ctx.gfx_level >= GFX11) {
encoding |= (mubuf.slc ? 1 : 0) << 12;
encoding |= (mubuf.dlc ? 1 : 0) << 13;
encoding |= (slc ? 1 : 0) << 12;
encoding |= (dlc ? 1 : 0) << 13;
} else if (ctx.gfx_level >= GFX10) {
encoding |= (mubuf.dlc ? 1 : 0) << 15;
encoding |= (dlc ? 1 : 0) << 15;
}
encoding |= 0x0FFF & mubuf.offset;
out.push_back(encoding);
encoding = 0;
if (ctx.gfx_level <= GFX7 || (ctx.gfx_level >= GFX10 && ctx.gfx_level <= GFX10_3)) {
encoding |= (mubuf.slc ? 1 : 0) << 22;
encoding |= (slc ? 1 : 0) << 22;
}
encoding |= reg(ctx, instr->operands[2]) << 24;
if (ctx.gfx_level >= GFX11) {
@ -625,24 +632,27 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MTBUF_instruction& mtbuf = instr->mtbuf();
bool glc = mtbuf.cache.value & ac_glc;
bool slc = mtbuf.cache.value & ac_slc;
bool dlc = mtbuf.cache.value & ac_dlc;
uint32_t img_format = ac_get_tbuffer_format(ctx.gfx_level, mtbuf.dfmt, mtbuf.nfmt);
uint32_t encoding = (0b111010 << 26);
assert(img_format <= 0x7F);
assert(!mtbuf.dlc || ctx.gfx_level >= GFX10);
assert(!dlc || ctx.gfx_level >= GFX10);
if (ctx.gfx_level >= GFX11) {
encoding |= (mtbuf.slc ? 1 : 0) << 12;
encoding |= (mtbuf.dlc ? 1 : 0) << 13;
encoding |= (slc ? 1 : 0) << 12;
encoding |= (dlc ? 1 : 0) << 13;
} else {
/* DLC bit replaces one bit of the OPCODE on GFX10 */
encoding |= (mtbuf.dlc ? 1 : 0) << 15;
encoding |= (dlc ? 1 : 0) << 15;
}
if (ctx.gfx_level <= GFX10_3) {
encoding |= (mtbuf.idxen ? 1 : 0) << 13;
encoding |= (mtbuf.offen ? 1 : 0) << 12;
}
encoding |= (mtbuf.glc ? 1 : 0) << 14;
encoding |= (glc ? 1 : 0) << 14;
encoding |= 0x0FFF & mtbuf.offset;
encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
@ -662,7 +672,7 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
encoding |= (mtbuf.idxen ? 1 : 0) << 23;
} else {
encoding |= (mtbuf.tfe ? 1 : 0) << 23;
encoding |= (mtbuf.slc ? 1 : 0) << 22;
encoding |= (slc ? 1 : 0) << 22;
}
encoding |= (reg(ctx, instr->operands[0]) >> 2) << 16;
if (instr->operands.size() > 3)
@ -721,6 +731,9 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
MIMG_instruction& mimg = instr->mimg();
bool glc = mimg.cache.value & ac_glc;
bool slc = mimg.cache.value & ac_slc;
bool dlc = mimg.cache.value & ac_dlc;
unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
assert(!nsa_dwords || ctx.gfx_level >= GFX10);
@ -732,23 +745,23 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= mimg.dim << 2;
encoding |= mimg.unrm ? 1 << 7 : 0;
encoding |= (0xF & mimg.dmask) << 8;
encoding |= mimg.slc ? 1 << 12 : 0;
encoding |= mimg.dlc ? 1 << 13 : 0;
encoding |= mimg.glc ? 1 << 14 : 0;
encoding |= slc ? 1 << 12 : 0;
encoding |= dlc ? 1 << 13 : 0;
encoding |= glc ? 1 << 14 : 0;
encoding |= mimg.r128 ? 1 << 15 : 0;
encoding |= mimg.a16 ? 1 << 16 : 0;
encoding |= mimg.d16 ? 1 << 17 : 0;
encoding |= (opcode & 0xFF) << 18;
} else {
encoding |= mimg.slc ? 1 << 25 : 0;
encoding |= slc ? 1 << 25 : 0;
encoding |= (opcode & 0x7f) << 18;
encoding |= (opcode >> 7) & 1;
encoding |= mimg.lwe ? 1 << 17 : 0;
encoding |= mimg.tfe ? 1 << 16 : 0;
encoding |= mimg.glc ? 1 << 13 : 0;
encoding |= glc ? 1 << 13 : 0;
encoding |= mimg.unrm ? 1 << 12 : 0;
if (ctx.gfx_level <= GFX9) {
assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
assert(!mimg.r128);
encoding |= mimg.a16 ? 1 << 15 : 0;
encoding |= mimg.da ? 1 << 14 : 0;
@ -757,7 +770,7 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= nsa_dwords << 1;
encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
encoding |= mimg.dlc ? 1 << 7 : 0;
encoding |= dlc ? 1 << 7 : 0;
}
encoding |= (0xF & mimg.dmask) << 8;
}
@ -856,6 +869,9 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
{
uint32_t opcode = ctx.opcode[(int)instr->opcode];
FLAT_instruction& flat = instr->flatlike();
bool glc = flat.cache.value & ac_glc;
bool slc = flat.cache.value & ac_slc;
bool dlc = flat.cache.value & ac_dlc;
uint32_t encoding = (0b110111 << 26);
encoding |= opcode << 18;
@ -879,13 +895,13 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
else if (instr->isGlobal())
encoding |= 2 << (ctx.gfx_level >= GFX11 ? 16 : 14);
encoding |= flat.lds ? 1 << 13 : 0;
encoding |= flat.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= flat.slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
encoding |= slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
if (ctx.gfx_level >= GFX10) {
assert(!flat.nv);
encoding |= flat.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
} else {
assert(!flat.dlc);
assert(!dlc);
}
out.push_back(encoding);
encoding = reg(ctx, instr->operands[0], 8);


@ -4423,6 +4423,35 @@ lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
ac_hw_cache_flags
get_gfx6_cache_flags(bool glc, bool slc, bool dlc)
{
uint8_t value = 0;
value |= glc ? ac_glc : 0;
value |= slc ? ac_slc : 0;
value |= dlc ? ac_dlc : 0;
return ac_hw_cache_flags{value};
}
ac_hw_cache_flags
get_load_cache_flags(Builder& bld, bool glc, bool slc)
{
bool dlc = glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
return get_gfx6_cache_flags(glc, slc, dlc);
}
ac_hw_cache_flags
get_store_cache_flags(Builder& bld, bool glc, bool slc)
{
return get_gfx6_cache_flags(glc, slc, false);
}
ac_hw_cache_flags
get_atomic_cache_flags(Builder& bld, bool return_previous)
{
return get_gfx6_cache_flags(return_previous, false, false);
}
Temp
smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
unsigned align, unsigned const_offset, Temp dst_hint)
@ -4478,9 +4507,7 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
load->definitions[0] = Definition(val);
load->smem().glc = info.glc;
load->smem().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
load->smem().cache = get_load_cache_flags(bld, info.glc, false);
load->smem().sync = info.sync;
bld.insert(std::move(load));
return val;
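
Worked examples of the three helpers introduced above, derived directly from their bodies (GFX10 and GFX10.3 are the only levels where the extra dlc bit is added):

// get_load_cache_flags(bld, true /*glc*/, false /*slc*/)
//   on GFX10 or GFX10_3  -> value == ac_glc | ac_dlc
//   on other gfx levels  -> value == ac_glc
// get_store_cache_flags(bld, glc, slc)          -> the glc/slc bits only, never dlc
// get_atomic_cache_flags(bld, return_previous)  -> ac_glc iff return_previous
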
@ -4539,13 +4566,11 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
mubuf->operands[2] = soffset;
mubuf->mubuf().offen = offen;
mubuf->mubuf().idxen = idxen;
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mubuf->mubuf().slc = info.slc;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
if (info.swizzle_component_size != 0)
mubuf->mubuf().cache.value |= ac_swizzled;
mubuf->mubuf().sync = info.sync;
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().swizzled = info.swizzle_component_size != 0;
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
@ -4607,10 +4632,7 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
mubuf->operands[2] = soffset;
mubuf->mubuf().offen = offen;
mubuf->mubuf().idxen = idxen;
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mubuf->mubuf().slc = info.slc;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
mubuf->mubuf().sync = info.sync;
mubuf->mubuf().offset = const_offset;
RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
@ -4818,8 +4840,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
mubuf->operands[2] = Operand(offset);
mubuf->mubuf().glc = info.glc;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, false);
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = false;
@ -4838,9 +4859,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
flat->operands[0] = Operand(addr);
flat->operands[1] = Operand(s1);
}
flat->flatlike().glc = info.glc;
flat->flatlike().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
flat->flatlike().cache = get_load_cache_flags(bld, info.glc, false);
flat->flatlike().sync = info.sync;
assert(global || !const_offset);
flat->flatlike().offset = const_offset;
@ -5673,10 +5692,7 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
mtbuf->operands[2] = soffset;
mtbuf->mtbuf().offen = offen;
mtbuf->mtbuf().idxen = idxen;
mtbuf->mtbuf().glc = info.glc;
mtbuf->mtbuf().dlc =
info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
mtbuf->mtbuf().slc = info.slc;
mtbuf->mtbuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
mtbuf->mtbuf().sync = info.sync;
mtbuf->mtbuf().offset = const_offset;
mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
@ -6220,6 +6236,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
unsigned access = nir_intrinsic_access(instr);
bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
unsigned result_size = instr->def.num_components - is_sparse;
unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
@ -6275,9 +6292,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
load->operands[2] = Operand::c32(0);
load->definitions[0] = Definition(tmp);
load->mubuf().idxen = true;
load->mubuf().glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->mubuf().dlc = load->mubuf().glc &&
(ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
load->mubuf().cache = get_load_cache_flags(bld, glc, false);
load->mubuf().sync = sync;
load->mubuf().tfe = is_sparse;
if (load->mubuf().tfe)
@ -6296,9 +6311,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
load->dlc =
load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
load->cache = get_load_cache_flags(bld, glc, false);
load->a16 = instr->src[1].ssa->bit_size == 16;
load->d16 = d16;
load->dmask = dmask;
@ -6422,8 +6435,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
store->operands[2] = Operand::c32(0);
store->operands[3] = Operand(data);
store->mubuf().idxen = true;
store->mubuf().glc = glc;
store->mubuf().dlc = false;
store->mubuf().cache = get_store_cache_flags(bld, glc, false);
store->mubuf().disable_wqm = true;
store->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6440,8 +6452,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
MIMG_instruction* store =
emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
store->glc = glc;
store->dlc = false;
store->cache = get_store_cache_flags(bld, glc, false);
store->a16 = instr->src[1].ssa->bit_size == 16;
store->d16 = d16;
store->dmask = dmask;
@ -6581,8 +6592,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
mubuf->definitions[0] = def;
mubuf->mubuf().offset = 0;
mubuf->mubuf().idxen = true;
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false; /* Not needed for atomics */
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().disable_wqm = true;
mubuf->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6597,8 +6607,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
MIMG_instruction* mimg =
emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
mimg->glc = return_previous;
mimg->dlc = false; /* Not needed for atomics */
mimg->cache = get_atomic_cache_flags(bld, return_previous);
mimg->dmask = (1 << data.size()) - 1;
mimg->a16 = instr->src[1].ssa->bit_size == 16;
mimg->unrm = true;
@ -6670,8 +6679,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
store->operands[3] = Operand(write_datas[i]);
store->mubuf().offset = offsets[i];
store->mubuf().offen = (offset.type() == RegType::vgpr);
store->mubuf().glc = glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4);
store->mubuf().dlc = false;
store->mubuf().cache = get_store_cache_flags(
bld, glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4), false);
store->mubuf().disable_wqm = true;
store->mubuf().sync = sync;
ctx->program->needs_exact = true;
@ -6712,8 +6721,7 @@ visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
mubuf->definitions[0] = def;
mubuf->mubuf().offset = 0;
mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false; /* Not needed for atomics */
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().disable_wqm = true;
mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
@ -6846,8 +6854,7 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
flat->operands[1] = Operand(s1);
}
flat->operands[2] = Operand(write_datas[i]);
flat->flatlike().glc = glc;
flat->flatlike().dlc = false;
flat->flatlike().cache = get_store_cache_flags(bld, glc, false);
assert(global || !write_const_offset);
flat->flatlike().offset = write_const_offset;
flat->flatlike().disable_wqm = true;
@ -6867,8 +6874,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
mubuf->operands[2] = Operand(write_offset);
mubuf->operands[3] = Operand(write_datas[i]);
mubuf->mubuf().glc = glc || write_datas[i].bytes() < 4;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache =
get_store_cache_flags(bld, glc || write_datas[i].bytes() < 4, false);
mubuf->mubuf().offset = write_const_offset;
mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = true;
@ -6980,8 +6987,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
flat->operands[2] = Operand(data);
if (return_previous)
flat->definitions[0] = Definition(dst);
flat->flatlike().glc = return_previous;
flat->flatlike().dlc = false; /* Not needed for atomics */
flat->flatlike().cache = get_atomic_cache_flags(bld, return_previous);
assert(global || !const_offset);
flat->flatlike().offset = const_offset;
flat->flatlike().disable_wqm = true;
@ -7007,8 +7013,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
if (return_previous)
mubuf->definitions[0] = def;
mubuf->mubuf().glc = return_previous;
mubuf->mubuf().dlc = false;
mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
mubuf->mubuf().offset = const_offset;
mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
mubuf->mubuf().disable_wqm = true;
@ -7167,6 +7172,9 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
glc |= ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
glc &= ctx->program->gfx_level < GFX11;
ac_hw_cache_flags cache = get_store_cache_flags(bld, glc, slc);
if (swizzled)
cache.value |= ac_swizzled;
Operand vaddr_op(v1);
if (offen && idxen)
@ -7177,9 +7185,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
vaddr_op = Operand(idx);
Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
Operand(write_datas[i]), const_offset, offen, swizzled, idxen,
/* addr64 */ false, /* disable_wqm */ false, glc,
/* dlc */ false, slc)
Operand(write_datas[i]), const_offset, offen, idxen,
/* addr64 */ false, /* disable_wqm */ false, cache)
.instr;
mubuf->mubuf().sync = sync;
}
@ -7637,9 +7644,11 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
write_datas[i], offsets[i], true, true);
write_datas[i], offsets[i], true);
mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
mubuf->mubuf().glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
bool glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
mubuf->mubuf().cache = get_store_cache_flags(bld, glc, false);
mubuf->mubuf().cache.value |= ac_swizzled;
}
}
}
@ -12098,9 +12107,12 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
Operand::zero());
ac_hw_cache_flags cache_glc;
cache_glc.value = ac_glc;
/* Store TTMP0-TTMP1. */
bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
Operand(PhysReg{ttmp0}, s2), memory_sync_info(), cache_glc);
uint32_t hw_regs_idx[] = {
2, /* HW_REG_STATUS */
@ -12116,7 +12128,8 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
((20 - 1) << 11) | hw_regs_idx[i]);
bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(),
cache_glc);
}
program->config->float_mode = program->blocks[0].fp_mode.val;
@ -12632,18 +12645,18 @@ load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index,
PhysReg scratch(load.scratch);
if (load.d16) {
bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
Operand::c32(0u), offset, false, false, true);
Operand::c32(0u), offset, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
Operand::c32(0u), offset + 2, false, false, true);
Operand::c32(0u), offset + 2, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
Operand::c32(0u), offset + 1, false, false, true);
Operand::c32(0u), offset + 1, false, true);
bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
Operand::c32(0u), offset + 3, false, false, true);
Operand::c32(0u), offset + 3, false, true);
} else {
for (unsigned i = 0; i < size; i++) {
Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(0u), offset + i,
false, false, true);
false, true);
}
}
@ -12835,7 +12848,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
i += slots;
} else {
bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
loc++;
i++;
}


@ -15,6 +15,7 @@
#include "ac_binary.h"
#include "ac_hw_stage.h"
#include "ac_shader_util.h"
#include "amd_family.h"
#include <algorithm>
#include <bitset>
@ -1309,11 +1310,7 @@ static_assert(sizeof(SALU_instruction) == sizeof(Instruction) + 4, "Unexpected p
*/
struct SMEM_instruction : public Instruction {
memory_sync_info sync;
bool glc : 1; /* VI+: globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool nv : 1; /* VEGA only: Non-volatile */
bool disable_wqm : 1;
uint8_t padding : 4;
ac_hw_cache_flags cache;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
@ -1492,19 +1489,16 @@ static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MUBUF_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
uint16_t swizzled : 1;
uint16_t padding0 : 2;
uint16_t padding1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
uint8_t padding1;
uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1518,16 +1512,14 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MTBUF_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
uint8_t nfmt : 3; /* Numeric format of data in memory */
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */
uint16_t glc : 1; /* globally coherent */
uint16_t dlc : 1; /* NAVI: device level coherent */
uint16_t slc : 1; /* system level coherent */
uint16_t tfe : 1; /* texture fail enable */
uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
uint16_t padding : 10;
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool tfe : 1; /* texture fail enable */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding : 5;
uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1543,12 +1535,10 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
*/
struct MIMG_instruction : public Instruction {
memory_sync_info sync;
ac_hw_cache_flags cache;
uint8_t dmask; /* Data VGPR enable mask */
uint8_t dim : 3; /* NAVI: dimensionality */
bool unrm : 1; /* Force address to be un-normalized */
bool dlc : 1; /* NAVI: device level coherent */
bool glc : 1; /* globally coherent */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool da : 1; /* declare an array */
bool lwe : 1; /* LOD warning enable */
@ -1557,9 +1547,8 @@ struct MIMG_instruction : public Instruction {
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
bool strict_wqm : 1; /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
uint8_t padding0 : 1;
uint8_t padding0 : 4;
uint8_t padding1;
uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@ -1572,15 +1561,13 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected p
*/
struct FLAT_instruction : public Instruction {
memory_sync_info sync;
bool slc : 1; /* system level coherent */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
ac_hw_cache_flags cache;
bool lds : 1;
bool nv : 1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
uint8_t padding0 : 5;
uint8_t padding1;
int16_t offset; /* Vega/Navi only */
uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
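
With the coherence bools gone from the structs above, callers touch the packed byte directly. Two representative one-liners, adapted from hunks elsewhere in this commit:

instr->mubuf().cache.value |= ac_swizzled;         /* optimizer/spiller: mark a swizzled access */
bool dlc = instr->flatlike().cache.value & ac_dlc; /* assembler: decode flags before encoding */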


@ -105,9 +105,7 @@ class Format(IntEnum):
return [('uint32_t', 'imm', '0')]
elif self == Format.SMEM:
return [('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'nv', 'false')]
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}')]
elif self == Format.DS:
return [('uint16_t', 'offset0', '0'),
('uint8_t', 'offset1', '0'),
@ -125,20 +123,15 @@ class Format(IntEnum):
('bool', 'offen', None),
('bool', 'idxen', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false')]
elif self == Format.MUBUF:
return [('unsigned', 'offset', None),
('bool', 'offen', None),
('bool', 'swizzled', 'false'),
('bool', 'idxen', 'false'),
('bool', 'addr64', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false'),
('bool', 'lds', 'false')]
elif self == Format.MIMG:
@ -146,9 +139,7 @@ class Format(IntEnum):
('bool', 'da', 'false'),
('bool', 'unrm', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'tfe', 'false'),
('bool', 'lwe', 'false'),
('bool', 'r128', 'false'),
@ -195,8 +186,7 @@ class Format(IntEnum):
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
return [('int16_t', 'offset', 0),
('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'slc', 'false'),
('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
('bool', 'lds', 'false'),
('bool', 'nv', 'false')]
else:


@ -164,8 +164,7 @@ struct InstrPred {
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
aS.disable_wqm == bS.disable_wqm;
return aS.sync == bS.sync && aS.cache.value == bS.cache.value;
}
case Format::VINTRP: {
VINTRP_instruction& aI = a->vintrp();
@ -203,21 +202,21 @@ struct InstrPred {
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
aM.idxen == bM.idxen && aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.cache.value == bM.cache.value && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}


@ -860,10 +860,7 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (!smem.definitions.empty())
new_instr->definitions[0] = smem.definitions[0];
new_instr->smem().sync = smem.sync;
new_instr->smem().glc = smem.glc;
new_instr->smem().dlc = smem.dlc;
new_instr->smem().nv = smem.nv;
new_instr->smem().disable_wqm = smem.disable_wqm;
new_instr->smem().cache = smem.cache;
instr.reset(new_instr);
}
}
@ -1429,13 +1426,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
while (info.is_temp())
info = ctx.info[info.temp.id()];
bool swizzled = mubuf.cache.value & ac_swizzled;
/* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
* overflow for scratch accesses works only on GFX9+ and saddr overflow
* never works. Since swizzling is the only thing that separates
* scratch accesses from other accesses, and swizzling significantly
* changes how addressing works, this probably applies to swizzled
* MUBUF accesses. */
bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
bool vaddr_prevent_overflow = swizzled && ctx.program->gfx_level < GFX9;
if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
@ -1465,7 +1463,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
mubuf.offset += offset;
continue;
} else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
base.regClass() == s1 && mubuf.offset + offset < 4096 && !swizzled) {
instr->operands[i].setTemp(base);
mubuf.offset += offset;
continue;


@ -262,6 +262,20 @@ print_sync(memory_sync_info sync, FILE* output)
print_scope(sync.scope, output);
}
template <typename T>
static void
print_cache_flags(enum amd_gfx_level gfx_level, const T& instr, FILE* output)
{
if (instr.cache.value & ac_glc)
fprintf(output, " glc");
if (instr.cache.value & ac_slc)
fprintf(output, " slc");
if (instr.cache.value & ac_dlc)
fprintf(output, " dlc");
if (instr.cache.value & ac_swizzled)
fprintf(output, " swizzled");
}
static void
print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output)
{
@ -428,12 +442,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
}
case Format::SMEM: {
const SMEM_instruction& smem = instr->smem();
if (smem.glc)
fprintf(output, " glc");
if (smem.dlc)
fprintf(output, " dlc");
if (smem.nv)
fprintf(output, " nv");
print_cache_flags(gfx_level, smem, output);
print_sync(smem.sync, output);
break;
}
@ -482,12 +491,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
fprintf(output, " idxen");
if (mubuf.addr64)
fprintf(output, " addr64");
if (mubuf.glc)
fprintf(output, " glc");
if (mubuf.dlc)
fprintf(output, " dlc");
if (mubuf.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mubuf, output);
if (mubuf.tfe)
fprintf(output, " tfe");
if (mubuf.lds)
@ -517,12 +521,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
}
if (mimg.unrm)
fprintf(output, " unrm");
if (mimg.glc)
fprintf(output, " glc");
if (mimg.dlc)
fprintf(output, " dlc");
if (mimg.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mimg, output);
if (mimg.tfe)
fprintf(output, " tfe");
if (mimg.da)
@ -594,12 +593,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
const FLAT_instruction& flat = instr->flatlike();
if (flat.offset)
fprintf(output, " offset:%d", flat.offset);
if (flat.glc)
fprintf(output, " glc");
if (flat.dlc)
fprintf(output, " dlc");
if (flat.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, flat, output);
if (flat.lds)
fprintf(output, " lds");
if (flat.nv)
@ -646,12 +640,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
fprintf(output, " offen");
if (mtbuf.idxen)
fprintf(output, " idxen");
if (mtbuf.glc)
fprintf(output, " glc");
if (mtbuf.dlc)
fprintf(output, " dlc");
if (mtbuf.slc)
fprintf(output, " slc");
print_cache_flags(gfx_level, mtbuf, output);
if (mtbuf.tfe)
fprintf(output, " tfe");
if (mtbuf.disable_wqm)


@ -1324,8 +1324,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
offset, memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
Operand(v1), scratch_offset, elem, offset, false, true);
Operand(v1), scratch_offset, elem, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
} else if (ctx.program->gfx_level >= GFX9) {
@ -1333,8 +1334,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
scratch_offset, temp, offset, false, true);
scratch_offset, temp, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
@ -1366,8 +1368,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
} else {
Instruction* instr =
bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false, true);
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}
bld.insert(vec);
@ -1376,8 +1379,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
memory_sync_info(storage_vgpr_spill, semantic_private));
} else {
Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
Operand(v1), scratch_offset, offset, false, true);
Operand(v1), scratch_offset, offset, false);
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
instr->mubuf().cache.value = ac_swizzled;
}
}


@ -411,13 +411,19 @@ BEGIN_TEST(assembler.smem)
//! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a
bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_non_temporal;
cache_coherent.value = ac_glc;
cache_non_temporal.value = ac_dlc;
//~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000
//~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_SYS ; f4620110 10000000
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().glc = true;
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
//~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000
//~gfx12! (then repeated 1 times)
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().dlc = true;
bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
cache_non_temporal;
finish_assembler_test();
}
@ -482,22 +488,31 @@ BEGIN_TEST(assembler.mubuf)
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80
//~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.glc = true;
.cache = cache_coherent;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80
//~gfx12! (then repeated 2 times)
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.dlc = true;
.cache = cache_non_temporal;
//~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80
bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
->mubuf()
.slc = true;
.cache = cache_sys_coherent;
//; if llvm_ver >= 16 and variant == 'gfx11':
//; insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80')
@ -562,7 +577,7 @@ BEGIN_TEST(assembler.mubuf)
bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
Operand::zero(), op_v1, 0, false)
->mubuf()
.glc = true;
.cache = cache_atomic_rtn;
finish_assembler_test();
}
@ -632,25 +647,32 @@ BEGIN_TEST(assembler.mtbuf)
false);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
//~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.glc = true;
.cache = cache_coherent;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
//~gfx12! (then repeated 2 times)
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.dlc = true;
.cache = cache_non_temporal;
//~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
nfmt, 0, false)
->mtbuf()
.slc = true;
.cache = cache_sys_coherent;
//; if llvm_ver >= 16 and variant == 'gfx11':
//; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
@ -718,19 +740,28 @@ BEGIN_TEST(assembler.mimg)
0x1;
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
//~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().dlc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_non_temporal;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
//~gfx12! (then repeated 2 times)
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().glc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_coherent;
//~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().slc =
true;
bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
cache_sys_coherent;
//~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
//~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
@ -799,7 +830,7 @@ BEGIN_TEST(assembler.mimg)
//~gfx11! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D glc ; f0304f04 00100a14
//~gfx12! image_atomic_add_uint v10, [v20, v21, v0, v0], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d3c30001 0010800a 00001514
bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
op_v1, op_v2, 0xf, false, false, false, true)
op_v1, op_v2, 0xf, false, false, false, cache_atomic_rtn)
->mimg()
.dim = ac_image_2d;
@ -876,16 +907,28 @@ BEGIN_TEST(assembler.flat)
bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
/* Various flags */
ac_hw_cache_flags cache_coherent;
ac_hw_cache_flags cache_sys_coherent;
ac_hw_cache_flags cache_non_temporal;
ac_hw_cache_flags cache_atomic_rtn;
cache_coherent.value = ac_glc;
cache_sys_coherent.value = ac_slc;
cache_non_temporal.value = ac_dlc;
cache_atomic_rtn.value = ac_glc;
//~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014
//~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().slc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_sys_coherent;
//~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014
//~gfx12! (then repeated 2 times)
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().glc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_coherent;
//~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().dlc = true;
bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
cache_non_temporal;
/* Stores */
//~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14
@ -895,8 +938,8 @@ BEGIN_TEST(assembler.flat)
/* Atomic with return */
//~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc ; dcd64000 2a7c0a14
//~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().glc =
true;
bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
cache_atomic_rtn;
finish_assembler_test();
}