diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 203e8719595..b135de6afd2 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -126,11 +126,14 @@ template <typename T>
 uint32_t
 get_gfx12_cpol(const T& instr)
 {
+   bool glc = instr.cache.value & ac_glc;
+   bool slc = instr.cache.value & ac_slc;
+   bool dlc = instr.cache.value & ac_dlc;
    if (instr_info.is_atomic[(int)instr.opcode]) {
-      return (instr.glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
+      return (glc ? 1 /*TH_ATOMIC_RETURN*/ : 0) << 2;
    } else {
-      return (instr.definitions.empty() || instr.glc || instr.slc || instr.dlc) ? 3 /*SCOPE_SYS*/
-             : 0 /*SCOPE_CU*/;
+      return (instr.definitions.empty() || glc || slc || dlc) ? 3 /*SCOPE_SYS*/
+             : 0 /*SCOPE_CU*/;
    }
 }
 
@@ -228,6 +231,8 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
 {
    uint32_t opcode = ctx.opcode[(int)instr->opcode];
    SMEM_instruction& smem = instr->smem();
+   bool glc = smem.cache.value & ac_glc;
+   bool dlc = smem.cache.value & ac_dlc;
 
    bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
    bool is_load = !instr->definitions.empty();
@@ -258,22 +263,21 @@ emit_smem_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
 
    if (ctx.gfx_level <= GFX9) {
       encoding = (0b110000 << 26);
-      assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
-      encoding |= smem.nv ? 1 << 15 : 0;
+      assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
+      /* We don't use the NV bit. */
    } else {
       encoding = (0b111101 << 26);
-      assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
       if (ctx.gfx_level <= GFX11_5)
-         encoding |= smem.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
+         encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 14) : 0;
    }
 
    if (ctx.gfx_level <= GFX11_5) {
       encoding |= opcode << 18;
-      encoding |= smem.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
+      encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
    } else {
       encoding |= opcode << 13;
       if (is_load)
-         encoding |= ((smem.glc || smem.dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
+         encoding |= ((glc || dlc) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/) << 21;
    }
 
    if (ctx.gfx_level <= GFX9) {
@@ -536,6 +540,9 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
 {
    uint32_t opcode = ctx.opcode[(int)instr->opcode];
    MUBUF_instruction& mubuf = instr->mubuf();
+   bool glc = mubuf.cache.value & ac_glc;
+   bool slc = mubuf.cache.value & ac_slc;
+   bool dlc = mubuf.cache.value & ac_dlc;
 
    uint32_t encoding = (0b111000 << 26);
    if (ctx.gfx_level >= GFX11 && mubuf.lds) /* GFX11 has separate opcodes for LDS loads */
@@ -543,7 +550,7 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
    else
       encoding |= (mubuf.lds ? 1 : 0) << 16;
    encoding |= opcode << 18;
-   encoding |= (mubuf.glc ? 1 : 0) << 14;
+   encoding |= (glc ? 1 : 0) << 14;
    if (ctx.gfx_level <= GFX10_3)
       encoding |= (mubuf.idxen ? 1 : 0) << 13;
    assert(!mubuf.addr64 || ctx.gfx_level <= GFX7);
@@ -552,19 +559,19 @@ emit_mubuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
    if (ctx.gfx_level <= GFX10_3)
       encoding |= (mubuf.offen ? 1 : 0) << 12;
    if (ctx.gfx_level == GFX8 || ctx.gfx_level == GFX9) {
-      assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
-      encoding |= (mubuf.slc ? 1 : 0) << 17;
+      assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
+      encoding |= (slc ? 1 : 0) << 17;
    } else if (ctx.gfx_level >= GFX11) {
-      encoding |= (mubuf.slc ? 1 : 0) << 12;
-      encoding |= (mubuf.dlc ? 1 : 0) << 13;
+      encoding |= (slc ? 1 : 0) << 12;
+      encoding |= (dlc ? 1 : 0) << 13;
    } else if (ctx.gfx_level >= GFX10) {
-      encoding |= (mubuf.dlc ? 1 : 0) << 15;
+      encoding |= (dlc ? 1 : 0) << 15;
    }
    encoding |= 0x0FFF & mubuf.offset;
    out.push_back(encoding);
    encoding = 0;
    if (ctx.gfx_level <= GFX7 || (ctx.gfx_level >= GFX10 && ctx.gfx_level <= GFX10_3)) {
-      encoding |= (mubuf.slc ? 1 : 0) << 22;
+      encoding |= (slc ? 1 : 0) << 22;
    }
    encoding |= reg(ctx, instr->operands[2]) << 24;
    if (ctx.gfx_level >= GFX11) {
@@ -625,24 +632,27 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
 {
    uint32_t opcode = ctx.opcode[(int)instr->opcode];
    MTBUF_instruction& mtbuf = instr->mtbuf();
+   bool glc = mtbuf.cache.value & ac_glc;
+   bool slc = mtbuf.cache.value & ac_slc;
+   bool dlc = mtbuf.cache.value & ac_dlc;
 
    uint32_t img_format = ac_get_tbuffer_format(ctx.gfx_level, mtbuf.dfmt, mtbuf.nfmt);
 
    uint32_t encoding = (0b111010 << 26);
    assert(img_format <= 0x7F);
-   assert(!mtbuf.dlc || ctx.gfx_level >= GFX10);
+   assert(!dlc || ctx.gfx_level >= GFX10);
 
    if (ctx.gfx_level >= GFX11) {
-      encoding |= (mtbuf.slc ? 1 : 0) << 12;
-      encoding |= (mtbuf.dlc ? 1 : 0) << 13;
+      encoding |= (slc ? 1 : 0) << 12;
+      encoding |= (dlc ? 1 : 0) << 13;
    } else {
       /* DLC bit replaces one bit of the OPCODE on GFX10 */
-      encoding |= (mtbuf.dlc ? 1 : 0) << 15;
+      encoding |= (dlc ? 1 : 0) << 15;
    }
 
    if (ctx.gfx_level <= GFX10_3) {
       encoding |= (mtbuf.idxen ? 1 : 0) << 13;
       encoding |= (mtbuf.offen ? 1 : 0) << 12;
    }
-   encoding |= (mtbuf.glc ? 1 : 0) << 14;
+   encoding |= (glc ? 1 : 0) << 14;
 
    encoding |= 0x0FFF & mtbuf.offset;
    encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
@@ -662,7 +672,7 @@ emit_mtbuf_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction
       encoding |= (mtbuf.idxen ? 1 : 0) << 23;
    } else {
       encoding |= (mtbuf.tfe ? 1 : 0) << 23;
-      encoding |= (mtbuf.slc ? 1 : 0) << 22;
+      encoding |= (slc ? 1 : 0) << 22;
    }
    encoding |= (reg(ctx, instr->operands[0]) >> 2) << 16;
    if (instr->operands.size() > 3)
@@ -721,6 +731,9 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
 {
    uint32_t opcode = ctx.opcode[(int)instr->opcode];
    MIMG_instruction& mimg = instr->mimg();
+   bool glc = mimg.cache.value & ac_glc;
+   bool slc = mimg.cache.value & ac_slc;
+   bool dlc = mimg.cache.value & ac_dlc;
 
    unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
    assert(!nsa_dwords || ctx.gfx_level >= GFX10);
@@ -732,23 +745,23 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       encoding |= mimg.dim << 2;
       encoding |= mimg.unrm ? 1 << 7 : 0;
       encoding |= (0xF & mimg.dmask) << 8;
-      encoding |= mimg.slc ? 1 << 12 : 0;
-      encoding |= mimg.dlc ? 1 << 13 : 0;
-      encoding |= mimg.glc ? 1 << 14 : 0;
+      encoding |= slc ? 1 << 12 : 0;
+      encoding |= dlc ? 1 << 13 : 0;
+      encoding |= glc ? 1 << 14 : 0;
       encoding |= mimg.r128 ? 1 << 15 : 0;
       encoding |= mimg.a16 ? 1 << 16 : 0;
       encoding |= mimg.d16 ? 1 << 17 : 0;
       encoding |= (opcode & 0xFF) << 18;
    } else {
-      encoding |= mimg.slc ? 1 << 25 : 0;
+      encoding |= slc ? 1 << 25 : 0;
       encoding |= (opcode & 0x7f) << 18;
       encoding |= (opcode >> 7) & 1;
       encoding |= mimg.lwe ? 1 << 17 : 0;
       encoding |= mimg.tfe ? 1 << 16 : 0;
-      encoding |= mimg.glc ? 1 << 13 : 0;
+      encoding |= glc ? 1 << 13 : 0;
       encoding |= mimg.unrm ? 1 << 12 : 0;
       if (ctx.gfx_level <= GFX9) {
-         assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
+         assert(!dlc); /* Device-level coherent is not supported on GFX9 and lower */
          assert(!mimg.r128);
          encoding |= mimg.a16 ? 1 << 15 : 0;
          encoding |= mimg.da ? 1 << 14 : 0;
@@ -757,7 +770,7 @@ emit_mimg_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
                      : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
          encoding |= nsa_dwords << 1;
          encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
-         encoding |= mimg.dlc ? 1 << 7 : 0;
+         encoding |= dlc ? 1 << 7 : 0;
       }
       encoding |= (0xF & mimg.dmask) << 8;
    }
@@ -856,6 +869,9 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
 {
    uint32_t opcode = ctx.opcode[(int)instr->opcode];
    FLAT_instruction& flat = instr->flatlike();
+   bool glc = flat.cache.value & ac_glc;
+   bool slc = flat.cache.value & ac_slc;
+   bool dlc = flat.cache.value & ac_dlc;
 
    uint32_t encoding = (0b110111 << 26);
    encoding |= opcode << 18;
@@ -879,13 +895,13 @@ emit_flatlike_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruct
    else if (instr->isGlobal())
       encoding |= 2 << (ctx.gfx_level >= GFX11 ? 16 : 14);
    encoding |= flat.lds ? 1 << 13 : 0;
-   encoding |= flat.glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
-   encoding |= flat.slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
+   encoding |= glc ? 1 << (ctx.gfx_level >= GFX11 ? 14 : 16) : 0;
+   encoding |= slc ? 1 << (ctx.gfx_level >= GFX11 ? 15 : 17) : 0;
    if (ctx.gfx_level >= GFX10) {
      assert(!flat.nv);
-      encoding |= flat.dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
+      encoding |= dlc ? 1 << (ctx.gfx_level >= GFX11 ? 13 : 12) : 0;
    } else {
-      assert(!flat.dlc);
+      assert(!dlc);
    }
    out.push_back(encoding);
    encoding = reg(ctx, instr->operands[0], 8);
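
Context for the assembler hunks above: they read the legacy GFX6-style bits out of the new combined cache field instead of per-format booleans. A minimal sketch of the flag values this assumes (the authoritative definitions live in ac_shader_util.h; the exact bit positions below are illustrative, not copied from that header):

   /* Assumed GFX6-style cache-policy bits carried in ac_hw_cache_flags::value.
    * Bit positions are illustrative; see ac_shader_util.h for the real enum. */
   enum {
      ac_glc      = 1 << 0, /* globally coherent */
      ac_slc      = 1 << 1, /* system level coherent */
      ac_dlc      = 1 << 2, /* GFX10-GFX10.3: device level coherent */
      ac_swizzled = 1 << 3, /* buffer access uses swizzled addressing */
   };
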
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 2de459a79e5..f1dde7a0171 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4423,6 +4423,35 @@ lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
 
 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
 
+ac_hw_cache_flags
+get_gfx6_cache_flags(bool glc, bool slc, bool dlc)
+{
+   uint8_t value = 0;
+   value |= glc ? ac_glc : 0;
+   value |= slc ? ac_slc : 0;
+   value |= dlc ? ac_dlc : 0;
+   return ac_hw_cache_flags{value};
+}
+
+ac_hw_cache_flags
+get_load_cache_flags(Builder& bld, bool glc, bool slc)
+{
+   bool dlc = glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   return get_gfx6_cache_flags(glc, slc, dlc);
+}
+
+ac_hw_cache_flags
+get_store_cache_flags(Builder& bld, bool glc, bool slc)
+{
+   return get_gfx6_cache_flags(glc, slc, false);
+}
+
+ac_hw_cache_flags
+get_atomic_cache_flags(Builder& bld, bool return_previous)
+{
+   return get_gfx6_cache_flags(return_previous, false, false);
+}
+
 Temp
 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                    unsigned align, unsigned const_offset, Temp dst_hint)
@@ -4478,9 +4507,7 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
    load->definitions[0] = Definition(val);
-   load->smem().glc = info.glc;
-   load->smem().dlc =
-      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   load->smem().cache = get_load_cache_flags(bld, info.glc, false);
    load->smem().sync = info.sync;
    bld.insert(std::move(load));
    return val;
@@ -4539,13 +4566,11 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
    mubuf->operands[2] = soffset;
    mubuf->mubuf().offen = offen;
    mubuf->mubuf().idxen = idxen;
-   mubuf->mubuf().glc = info.glc;
-   mubuf->mubuf().dlc =
-      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mubuf->mubuf().slc = info.slc;
+   mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
+   if (info.swizzle_component_size != 0)
+      mubuf->mubuf().cache.value |= ac_swizzled;
    mubuf->mubuf().sync = info.sync;
    mubuf->mubuf().offset = const_offset;
-   mubuf->mubuf().swizzled = info.swizzle_component_size != 0;
    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
    mubuf->definitions[0] = Definition(val);
@@ -4607,10 +4632,7 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
    mubuf->operands[2] = soffset;
    mubuf->mubuf().offen = offen;
    mubuf->mubuf().idxen = idxen;
-   mubuf->mubuf().glc = info.glc;
-   mubuf->mubuf().dlc =
-      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mubuf->mubuf().slc = info.slc;
+   mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
    mubuf->mubuf().sync = info.sync;
    mubuf->mubuf().offset = const_offset;
    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
@@ -4818,8 +4840,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
       mubuf->operands[2] = Operand(offset);
-      mubuf->mubuf().glc = info.glc;
-      mubuf->mubuf().dlc = false;
+      mubuf->mubuf().cache = get_load_cache_flags(bld, info.glc, false);
       mubuf->mubuf().offset = const_offset;
       mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
       mubuf->mubuf().disable_wqm = false;
@@ -4838,9 +4859,7 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
          flat->operands[0] = Operand(addr);
         flat->operands[1] = Operand(s1);
      }
-      flat->flatlike().glc = info.glc;
-      flat->flatlike().dlc =
-         info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+      flat->flatlike().cache = get_load_cache_flags(bld, info.glc, false);
      flat->flatlike().sync = info.sync;
      assert(global || !const_offset);
      flat->flatlike().offset = const_offset;
@@ -5673,10 +5692,7 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
    mtbuf->operands[2] = soffset;
    mtbuf->mtbuf().offen = offen;
    mtbuf->mtbuf().idxen = idxen;
-   mtbuf->mtbuf().glc = info.glc;
-   mtbuf->mtbuf().dlc =
-      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mtbuf->mtbuf().slc = info.slc;
+   mtbuf->mtbuf().cache = get_load_cache_flags(bld, info.glc, info.slc);
    mtbuf->mtbuf().sync = info.sync;
    mtbuf->mtbuf().offset = const_offset;
    mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
@@ -6220,6 +6236,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
 
    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
    unsigned access = nir_intrinsic_access(instr);
+   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
 
    unsigned result_size = instr->def.num_components - is_sparse;
    unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
@@ -6275,9 +6292,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
       load->operands[2] = Operand::c32(0);
       load->definitions[0] = Definition(tmp);
       load->mubuf().idxen = true;
-      load->mubuf().glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
-      load->mubuf().dlc = load->mubuf().glc &&
-                          (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
+      load->mubuf().cache = get_load_cache_flags(bld, glc, false);
       load->mubuf().sync = sync;
       load->mubuf().tfe = is_sparse;
       if (load->mubuf().tfe)
@@ -6296,9 +6311,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
       MIMG_instruction* load =
          emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
-      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
-      load->dlc =
-         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
+      load->cache = get_load_cache_flags(bld, glc, false);
       load->a16 = instr->src[1].ssa->bit_size == 16;
       load->d16 = d16;
       load->dmask = dmask;
@@ -6422,8 +6435,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
       store->operands[2] = Operand::c32(0);
       store->operands[3] = Operand(data);
       store->mubuf().idxen = true;
-      store->mubuf().glc = glc;
-      store->mubuf().dlc = false;
+      store->mubuf().cache = get_store_cache_flags(bld, glc, false);
       store->mubuf().disable_wqm = true;
       store->mubuf().sync = sync;
       ctx->program->needs_exact = true;
@@ -6440,8 +6452,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
 
       MIMG_instruction* store =
          emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
-      store->glc = glc;
-      store->dlc = false;
+      store->cache = get_store_cache_flags(bld, glc, false);
       store->a16 = instr->src[1].ssa->bit_size == 16;
       store->d16 = d16;
       store->dmask = dmask;
@@ -6581,8 +6592,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
       mubuf->definitions[0] = def;
       mubuf->mubuf().offset = 0;
       mubuf->mubuf().idxen = true;
-      mubuf->mubuf().glc = return_previous;
-      mubuf->mubuf().dlc = false; /* Not needed for atomics */
+      mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
       mubuf->mubuf().disable_wqm = true;
       mubuf->mubuf().sync = sync;
       ctx->program->needs_exact = true;
@@ -6597,8 +6607,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
    MIMG_instruction* mimg =
       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
-   mimg->glc = return_previous;
-   mimg->dlc = false; /* Not needed for atomics */
+   mimg->cache = get_atomic_cache_flags(bld, return_previous);
    mimg->dmask = (1 << data.size()) - 1;
    mimg->a16 = instr->src[1].ssa->bit_size == 16;
    mimg->unrm = true;
@@ -6670,8 +6679,8 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
       store->operands[3] = Operand(write_datas[i]);
       store->mubuf().offset = offsets[i];
       store->mubuf().offen = (offset.type() == RegType::vgpr);
-      store->mubuf().glc = glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4);
-      store->mubuf().dlc = false;
+      store->mubuf().cache = get_store_cache_flags(
+         bld, glc || (ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4), false);
       store->mubuf().disable_wqm = true;
       store->mubuf().sync = sync;
       ctx->program->needs_exact = true;
@@ -6712,8 +6721,7 @@ visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
      mubuf->definitions[0] = def;
   mubuf->mubuf().offset = 0;
   mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
-   mubuf->mubuf().glc = return_previous;
-   mubuf->mubuf().dlc = false; /* Not needed for atomics */
+   mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
   mubuf->mubuf().disable_wqm = true;
   mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
   ctx->program->needs_exact = true;
@@ -6846,8 +6854,7 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
         flat->operands[1] = Operand(s1);
      }
      flat->operands[2] = Operand(write_datas[i]);
-      flat->flatlike().glc = glc;
-      flat->flatlike().dlc = false;
+      flat->flatlike().cache = get_store_cache_flags(bld, glc, false);
      assert(global || !write_const_offset);
      flat->flatlike().offset = write_const_offset;
      flat->flatlike().disable_wqm = true;
@@ -6867,8 +6874,8 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
            write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
         mubuf->operands[2] = Operand(write_offset);
         mubuf->operands[3] = Operand(write_datas[i]);
-         mubuf->mubuf().glc = glc || write_datas[i].bytes() < 4;
-         mubuf->mubuf().dlc = false;
+         mubuf->mubuf().cache =
+            get_store_cache_flags(bld, glc || write_datas[i].bytes() < 4, false);
         mubuf->mubuf().offset = write_const_offset;
         mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
         mubuf->mubuf().disable_wqm = true;
@@ -6980,8 +6987,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
      flat->operands[2] = Operand(data);
      if (return_previous)
         flat->definitions[0] = Definition(dst);
-      flat->flatlike().glc = return_previous;
-      flat->flatlike().dlc = false; /* Not needed for atomics */
+      flat->flatlike().cache = get_atomic_cache_flags(bld, return_previous);
      assert(global || !const_offset);
      flat->flatlike().offset = const_offset;
      flat->flatlike().disable_wqm = true;
@@ -7007,8 +7013,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
         return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
      if (return_previous)
         mubuf->definitions[0] = def;
-      mubuf->mubuf().glc = return_previous;
-      mubuf->mubuf().dlc = false;
+      mubuf->mubuf().cache = get_atomic_cache_flags(bld, return_previous);
      mubuf->mubuf().offset = const_offset;
      mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
      mubuf->mubuf().disable_wqm = true;
@@ -7167,6 +7172,9 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
      bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
      glc |= ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
      glc &= ctx->program->gfx_level < GFX11;
+      ac_hw_cache_flags cache = get_store_cache_flags(bld, glc, slc);
+      if (swizzled)
+         cache.value |= ac_swizzled;
 
      Operand vaddr_op(v1);
      if (offen && idxen)
@@ -7177,9 +7185,8 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
         vaddr_op = Operand(idx);
 
      Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
-                                     Operand(write_datas[i]), const_offset, offen, swizzled, idxen,
-                                     /* addr64 */ false, /* disable_wqm */ false, glc,
-                                     /* dlc */ false, slc)
+                                     Operand(write_datas[i]), const_offset, offen, idxen,
+                                     /* addr64 */ false, /* disable_wqm */ false, cache)
                              .instr;
      mubuf->mubuf().sync = sync;
   }
@@ -7637,9 +7644,11 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
-                                        write_datas[i], offsets[i], true, true);
+                                        write_datas[i], offsets[i], true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
-         mubuf->mubuf().glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
+         bool glc = ctx->program->gfx_level == GFX6 && write_datas[i].bytes() < 4;
+         mubuf->mubuf().cache = get_store_cache_flags(bld, glc, false);
+         mubuf->mubuf().cache.value |= ac_swizzled;
      }
   }
}
@@ -12098,9 +12107,12 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4),
             Operand(PhysReg{tma}, s2), Operand::zero());
 
+   ac_hw_cache_flags cache_glc;
+   cache_glc.value = ac_glc;
+
    /* Store TTMP0-TTMP1. */
    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
-            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
+            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), cache_glc);
 
    uint32_t hw_regs_idx[] = {
       2, /* HW_REG_STATUS */
@@ -12116,7 +12128,8 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
                ((20 - 1) << 11) | hw_regs_idx[i]);
 
       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
-               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
+               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(),
+               cache_glc);
    }
 
    program->config->float_mode = program->blocks[0].fp_mode.val;
@@ -12632,18 +12645,18 @@ load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index,
    PhysReg scratch(load.scratch);
    if (load.d16) {
       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
-                Operand::c32(0u), offset, false, false, true);
+                Operand::c32(0u), offset, false, true);
       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
-                Operand::c32(0u), offset + 2, false, false, true);
+                Operand::c32(0u), offset + 2, false, true);
       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
-                Operand::c32(0u), offset + 1, false, false, true);
+                Operand::c32(0u), offset + 1, false, true);
       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
-                Operand::c32(0u), offset + 3, false, false, true);
+                Operand::c32(0u), offset + 3, false, true);
    } else {
      for (unsigned i = 0; i < size; i++) {
         Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
         bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(0u), offset + i,
-                   false, false, true);
+                   false, true);
      }
   }
@@ -12835,7 +12848,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
            i += slots;
         } else {
            bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
-                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
+                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
            loc++;
            i++;
         }
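
The four get_*_cache_flags() helpers added at the top of aco_instruction_selection.cpp centralize the GFX10/GFX10.3 rule that coherent loads also need DLC, which every load path previously open-coded. A hypothetical call site (bld and mubuf stand in for whatever the surrounding function provides):

   /* Sketch: a coherent, non-slc buffer load. On GFX10/GFX10.3 the helper
    * sets ac_dlc automatically, so the gfx_level check no longer has to be
    * repeated at each call site. */
   ac_hw_cache_flags cache = get_load_cache_flags(bld, /*glc=*/true, /*slc=*/false);
   assert(bld.program->gfx_level != GFX10 || (cache.value & ac_dlc));
   mubuf->mubuf().cache = cache;
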
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index bbac03eb410..aa3c4bb996c 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -15,6 +15,7 @@
 #include "ac_binary.h"
 #include "ac_hw_stage.h"
+#include "ac_shader_util.h"
 #include "amd_family.h"
 
 #include <algorithm>
 #include <bitset>
@@ -1309,11 +1310,7 @@ static_assert(sizeof(SALU_instruction) == sizeof(Instruction) + 4, "Unexpected p
  */
 struct SMEM_instruction : public Instruction {
    memory_sync_info sync;
-   bool glc : 1; /* VI+: globally coherent */
-   bool dlc : 1; /* NAVI: device level coherent */
-   bool nv : 1;  /* VEGA only: Non-volatile */
-   bool disable_wqm : 1;
-   uint8_t padding : 4;
+   ac_hw_cache_flags cache;
 };
 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
 
@@ -1492,19 +1489,16 @@ static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected
  */
 struct MUBUF_instruction : public Instruction {
    memory_sync_info sync;
+   ac_hw_cache_flags cache;
    bool offen : 1;  /* Supply an offset from VGPR (VADDR) */
    bool idxen : 1;  /* Supply an index from VGPR (VADDR) */
    bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
-   bool glc : 1;    /* globally coherent */
-   bool dlc : 1;    /* NAVI: device level coherent */
-   bool slc : 1;    /* system level coherent */
    bool tfe : 1;    /* texture fail enable */
    bool lds : 1;    /* Return read-data to LDS instead of VGPRs */
-   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
-   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
-   uint16_t swizzled : 1;
-   uint16_t padding0 : 2;
-   uint16_t padding1;
+   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
+   uint8_t padding0 : 2;
+   uint8_t padding1;
+   uint16_t offset; /* Unsigned byte offset - 12 bit */
 };
 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 
@@ -1518,16 +1512,14 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
  */
 struct MTBUF_instruction : public Instruction {
    memory_sync_info sync;
+   ac_hw_cache_flags cache;
    uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
    uint8_t nfmt : 3;         /* Numeric format of data in memory */
    bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
-   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
-   uint16_t glc : 1;         /* globally coherent */
-   uint16_t dlc : 1;         /* NAVI: device level coherent */
-   uint16_t slc : 1;         /* system level coherent */
-   uint16_t tfe : 1;         /* texture fail enable */
-   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
-   uint16_t padding : 10;
+   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
+   bool tfe : 1;             /* texture fail enable */
+   bool disable_wqm : 1;     /* Require an exec mask without helper invocations */
+   uint8_t padding : 5;
    uint16_t offset; /* Unsigned byte offset - 12 bit */
 };
 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
@@ -1543,12 +1535,10 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected
  */
 struct MIMG_instruction : public Instruction {
    memory_sync_info sync;
+   ac_hw_cache_flags cache;
    uint8_t dmask;    /* Data VGPR enable mask */
    uint8_t dim : 3;  /* NAVI: dimensionality */
    bool unrm : 1;    /* Force address to be un-normalized */
-   bool dlc : 1;     /* NAVI: device level coherent */
-   bool glc : 1;     /* globally coherent */
-   bool slc : 1;     /* system level coherent */
    bool tfe : 1;     /* texture fail enable */
    bool da : 1;      /* declare an array */
    bool lwe : 1;     /* LOD warning enable */
@@ -1557,9 +1547,8 @@ struct MIMG_instruction : public Instruction {
    bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
    bool strict_wqm : 1;  /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
-   uint8_t padding0 : 1;
+   uint8_t padding0 : 4;
    uint8_t padding1;
-   uint8_t padding2;
 };
 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
 
@@ -1572,15 +1561,13 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected p
  */
 struct FLAT_instruction : public Instruction {
    memory_sync_info sync;
-   bool slc : 1; /* system level coherent */
-   bool glc : 1; /* globally coherent */
-   bool dlc : 1; /* NAVI: device level coherent */
+   ac_hw_cache_flags cache;
    bool lds : 1;
    bool nv : 1;
    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
-   uint8_t padding0 : 2;
+   uint8_t padding0 : 5;
+   uint8_t padding1;
    int16_t offset; /* Vega/Navi only */
-   uint16_t padding1;
 };
 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 9f23d595d78..816c59464dd 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -105,9 +105,7 @@ class Format(IntEnum):
          return [('uint32_t', 'imm', '0')]
       elif self == Format.SMEM:
          return [('memory_sync_info', 'sync', 'memory_sync_info()'),
-                 ('bool', 'glc', 'false'),
-                 ('bool', 'dlc', 'false'),
-                 ('bool', 'nv', 'false')]
+                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}')]
       elif self == Format.DS:
          return [('uint16_t', 'offset0', '0'),
                  ('uint8_t', 'offset1', '0'),
@@ -125,20 +123,15 @@ class Format(IntEnum):
                  ('bool', 'offen', None),
                  ('bool', 'idxen', 'false'),
                  ('bool', 'disable_wqm', 'false'),
-                 ('bool', 'glc', 'false'),
-                 ('bool', 'dlc', 'false'),
-                 ('bool', 'slc', 'false'),
+                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                  ('bool', 'tfe', 'false')]
       elif self == Format.MUBUF:
          return [('unsigned', 'offset', None),
                  ('bool', 'offen', None),
-                 ('bool', 'swizzled', 'false'),
                  ('bool', 'idxen', 'false'),
                  ('bool', 'addr64', 'false'),
                  ('bool', 'disable_wqm', 'false'),
-                 ('bool', 'glc', 'false'),
-                 ('bool', 'dlc', 'false'),
-                 ('bool', 'slc', 'false'),
+                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                  ('bool', 'tfe', 'false'),
                  ('bool', 'lds', 'false')]
       elif self == Format.MIMG:
@@ -146,9 +139,7 @@ class Format(IntEnum):
                  ('bool', 'da', 'false'),
                  ('bool', 'unrm', 'false'),
                  ('bool', 'disable_wqm', 'false'),
-                 ('bool', 'glc', 'false'),
-                 ('bool', 'dlc', 'false'),
-                 ('bool', 'slc', 'false'),
+                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                  ('bool', 'tfe', 'false'),
                  ('bool', 'lwe', 'false'),
                  ('bool', 'r128', 'false'),
@@ -195,8 +186,7 @@ class Format(IntEnum):
      elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
         return [('int16_t', 'offset', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
-                 ('bool', 'glc', 'false'),
-                 ('bool', 'slc', 'false'),
+                 ('ac_hw_cache_flags', 'cache', '{{0, 0, 0, 0, 0}}'),
                 ('bool', 'lds', 'false'),
                 ('bool', 'nv', 'false')]
      else:
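
Since aco_opcodes.py generates the Builder helpers, swapping the glc/dlc/slc (and, for MUBUF, swizzled) parameters for a single cache parameter changes every generated signature; '{{0, 0, 0, 0, 0}}' is the zero default for the new field. Schematically, a store now looks like the visit_store_buffer call earlier in this patch (operand names are placeholders):

   /* Sketch of a generated-builder call after the Format.MUBUF change: one
    * trailing ac_hw_cache_flags argument replaces the three coherence bools. */
   ac_hw_cache_flags cache;
   cache.value = ac_glc | ac_slc;
   bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset, Operand(write_datas[i]),
             const_offset, offen, idxen, /* addr64 */ false, /* disable_wqm */ false,
             cache);
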
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index 4f46f386d69..e040221be61 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -164,8 +164,7 @@ struct InstrPred {
       case Format::SMEM: {
          SMEM_instruction& aS = a->smem();
          SMEM_instruction& bS = b->smem();
-         return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
-                aS.disable_wqm == bS.disable_wqm;
+         return aS.sync == bS.sync && aS.cache.value == bS.cache.value;
      }
      case Format::VINTRP: {
         VINTRP_instruction& aI = a->vintrp();
@@ -203,21 +202,21 @@ struct InstrPred {
         MTBUF_instruction& bM = b->mtbuf();
         return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
                aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
-                aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
+                aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
                aM.disable_wqm == bM.disable_wqm;
      }
      case Format::MUBUF: {
         MUBUF_instruction& aM = a->mubuf();
         MUBUF_instruction& bM = b->mubuf();
         return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
-                aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
-                aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
+                aM.idxen == bM.idxen && aM.cache.value == bM.cache.value && aM.tfe == bM.tfe &&
+                aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
      }
      case Format::MIMG: {
         MIMG_instruction& aM = a->mimg();
         MIMG_instruction& bM = b->mimg();
         return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
-                aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
+                aM.cache.value == bM.cache.value && aM.tfe == bM.tfe && aM.da == bM.da &&
                aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
                aM.disable_wqm == bM.disable_wqm;
      }
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index ecce23e8bf0..61e3d605745 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -860,10 +860,7 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
      if (!smem.definitions.empty())
         new_instr->definitions[0] = smem.definitions[0];
      new_instr->smem().sync = smem.sync;
-      new_instr->smem().glc = smem.glc;
-      new_instr->smem().dlc = smem.dlc;
-      new_instr->smem().nv = smem.nv;
-      new_instr->smem().disable_wqm = smem.disable_wqm;
+      new_instr->smem().cache = smem.cache;
      instr.reset(new_instr);
   }
}
@@ -1429,13 +1426,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
         while (info.is_temp())
            info = ctx.info[info.temp.id()];
 
+         bool swizzled = mubuf.cache.value & ac_swizzled;
         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
          * overflow for scratch accesses works only on GFX9+ and saddr overflow
          * never works. Since swizzling is the only thing that separates
          * scratch accesses and other accesses and swizzling changing how
          * addressing works significantly, this probably applies to swizzled
          * MUBUF accesses. */
-         bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
+         bool vaddr_prevent_overflow = swizzled && ctx.program->gfx_level < GFX9;
 
         if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
             info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
@@ -1465,7 +1463,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
            mubuf.offset += offset;
            continue;
         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
-                    base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
+                    base.regClass() == s1 && mubuf.offset + offset < 4096 && !swizzled) {
            instr->operands[i].setTemp(base);
            mubuf.offset += offset;
            continue;
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index e34abb9b5fd..dfd86114998 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -262,6 +262,20 @@ print_sync(memory_sync_info sync, FILE* output)
    print_scope(sync.scope, output);
 }
 
+template <typename T>
+static void
+print_cache_flags(enum amd_gfx_level gfx_level, const T& instr, FILE* output)
+{
+   if (instr.cache.value & ac_glc)
+      fprintf(output, " glc");
+   if (instr.cache.value & ac_slc)
+      fprintf(output, " slc");
+   if (instr.cache.value & ac_dlc)
+      fprintf(output, " dlc");
+   if (instr.cache.value & ac_swizzled)
+      fprintf(output, " swizzled");
+}
+
 static void
 print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output)
 {
@@ -428,12 +442,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
      }
      case Format::SMEM: {
         const SMEM_instruction& smem = instr->smem();
-         if (smem.glc)
-            fprintf(output, " glc");
-         if (smem.dlc)
-            fprintf(output, " dlc");
-         if (smem.nv)
-            fprintf(output, " nv");
+         print_cache_flags(gfx_level, smem, output);
         print_sync(smem.sync, output);
         break;
      }
@@ -482,12 +491,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
         fprintf(output, " idxen");
      if (mubuf.addr64)
         fprintf(output, " addr64");
-      if (mubuf.glc)
-         fprintf(output, " glc");
-      if (mubuf.dlc)
-         fprintf(output, " dlc");
-      if (mubuf.slc)
-         fprintf(output, " slc");
+      print_cache_flags(gfx_level, mubuf, output);
      if (mubuf.tfe)
         fprintf(output, " tfe");
      if (mubuf.lds)
@@ -517,12 +521,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
      }
      if (mimg.unrm)
         fprintf(output, " unrm");
-      if (mimg.glc)
-         fprintf(output, " glc");
-      if (mimg.dlc)
-         fprintf(output, " dlc");
-      if (mimg.slc)
-         fprintf(output, " slc");
+      print_cache_flags(gfx_level, mimg, output);
      if (mimg.tfe)
         fprintf(output, " tfe");
      if (mimg.da)
@@ -594,12 +593,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
      const FLAT_instruction& flat = instr->flatlike();
      if (flat.offset)
         fprintf(output, " offset:%d", flat.offset);
-      if (flat.glc)
-         fprintf(output, " glc");
-      if (flat.dlc)
-         fprintf(output, " dlc");
-      if (flat.slc)
-         fprintf(output, " slc");
+      print_cache_flags(gfx_level, flat, output);
      if (flat.lds)
         fprintf(output, " lds");
      if (flat.nv)
@@ -646,12 +640,7 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
         fprintf(output, " offen");
      if (mtbuf.idxen)
         fprintf(output, " idxen");
-      if (mtbuf.glc)
-         fprintf(output, " glc");
-      if (mtbuf.dlc)
-         fprintf(output, " dlc");
-      if (mtbuf.slc)
-         fprintf(output, " slc");
+      print_cache_flags(gfx_level, mtbuf, output);
      if (mtbuf.tfe)
         fprintf(output, " tfe");
      if (mtbuf.disable_wqm)
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 10b8992578d..37d6b7408bb 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1324,8 +1324,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
                     offset, memory_sync_info(storage_vgpr_spill, semantic_private));
         } else {
            Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc,
-                                           Operand(v1), scratch_offset, elem, offset, false, true);
+                                           Operand(v1), scratch_offset, elem, offset, false);
            instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+            instr->mubuf().cache.value = ac_swizzled;
         }
      }
   } else if (ctx.program->gfx_level >= GFX9) {
@@ -1333,8 +1334,9 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
               memory_sync_info(storage_vgpr_spill, semantic_private));
   } else {
      Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
-                                     scratch_offset, temp, offset, false, true);
+                                     scratch_offset, temp, offset, false);
      instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+      instr->mubuf().cache.value = ac_swizzled;
   }
 
@@ -1366,8 +1368,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
         } else {
            Instruction* instr =
               bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
-                         Operand(v1), scratch_offset, offset, false, true);
+                         Operand(v1), scratch_offset, offset, false);
            instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+            instr->mubuf().cache.value = ac_swizzled;
         }
      }
      bld.insert(vec);
@@ -1376,8 +1379,9 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
                  memory_sync_info(storage_vgpr_spill, semantic_private));
      } else {
         Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
-                                        Operand(v1), scratch_offset, offset, false, true);
+                                        Operand(v1), scratch_offset, offset, false);
         instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+         instr->mubuf().cache.value = ac_swizzled;
      }
   }
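
One subtle effect of the value-numbering hunks above: the old MUBUF comparison never looked at the separate swizzled bit, while the single cache.value test now also covers ac_swizzled, so CSE additionally distinguishes swizzled from linear accesses. An illustrative check (a and b stand for two candidate instructions):

   /* Two otherwise-identical buffer loads differing only in the swizzle
    * flag now compare unequal, so value numbering keeps both. */
   a->mubuf().cache.value = ac_glc;
   b->mubuf().cache.value = ac_glc | ac_swizzled;
   assert(a->mubuf().cache.value != b->mubuf().cache.value);
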
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index 57a6b3c5d87..604c91acc68 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -411,13 +411,19 @@ BEGIN_TEST(assembler.smem)
    //! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a
    bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
 
+   ac_hw_cache_flags cache_coherent;
+   ac_hw_cache_flags cache_non_temporal;
+   cache_coherent.value = ac_glc;
+   cache_non_temporal.value = ac_dlc;
+
    //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000
    //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_SYS ; f4620110 10000000
-   bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().glc = true;
+   bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
 
    //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000
    //~gfx12! (then repeated 1 times)
-   bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().dlc = true;
+   bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
+      cache_non_temporal;
 
    finish_assembler_test();
 }
@@ -482,22 +488,31 @@ BEGIN_TEST(assembler.mubuf)
    bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
 
    /* Various flags */
+   ac_hw_cache_flags cache_coherent;
+   ac_hw_cache_flags cache_sys_coherent;
+   ac_hw_cache_flags cache_non_temporal;
+   ac_hw_cache_flags cache_atomic_rtn;
+   cache_coherent.value = ac_glc;
+   cache_sys_coherent.value = ac_slc;
+   cache_non_temporal.value = ac_dlc;
+   cache_atomic_rtn.value = ac_glc;
+
    //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80
    //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000
    bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
       ->mubuf()
-      .glc = true;
+      .cache = cache_coherent;
 
    //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80
    //~gfx12! (then repeated 2 times)
    bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
       ->mubuf()
-      .dlc = true;
+      .cache = cache_non_temporal;
 
    //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80
    bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
      ->mubuf()
-      .slc = true;
+      .cache = cache_sys_coherent;
 
    //; if llvm_ver >= 16 and variant == 'gfx11':
    //;    insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80')
@@ -562,7 +577,7 @@ BEGIN_TEST(assembler.mubuf)
    bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
             Operand::zero(), op_v1, 0, false)
      ->mubuf()
-      .glc = true;
+      .cache = cache_atomic_rtn;
 
    finish_assembler_test();
 }
@@ -632,25 +647,32 @@ BEGIN_TEST(assembler.mtbuf)
             false);
 
    /* Various flags */
+   ac_hw_cache_flags cache_coherent;
+   ac_hw_cache_flags cache_sys_coherent;
+   ac_hw_cache_flags cache_non_temporal;
+   cache_coherent.value = ac_glc;
+   cache_sys_coherent.value = ac_slc;
+   cache_non_temporal.value = ac_dlc;
+
    //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
    //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
    bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
             nfmt, 0, false)
      ->mtbuf()
-      .glc = true;
+      .cache = cache_coherent;
 
    //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
    //~gfx12! (then repeated 2 times)
    bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
             nfmt, 0, false)
      ->mtbuf()
-      .dlc = true;
+      .cache = cache_non_temporal;
 
    //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
    bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
             nfmt, 0, false)
      ->mtbuf()
-      .slc = true;
+      .cache = cache_sys_coherent;
 
    //; if llvm_ver >= 16 and variant == 'gfx11':
    //;    insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
@@ -718,19 +740,28 @@ BEGIN_TEST(assembler.mimg)
       0x1;
 
    /* Various flags */
+   ac_hw_cache_flags cache_coherent;
+   ac_hw_cache_flags cache_sys_coherent;
+   ac_hw_cache_flags cache_non_temporal;
+   ac_hw_cache_flags cache_atomic_rtn;
+   cache_coherent.value = ac_glc;
+   cache_sys_coherent.value = ac_slc;
+   cache_non_temporal.value = ac_dlc;
+   cache_atomic_rtn.value = ac_glc;
+
    //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
    //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
-   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().dlc =
-      true;
+   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
+      cache_non_temporal;
 
    //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
    //~gfx12! (then repeated 2 times)
-   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().glc =
-      true;
+   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
+      cache_coherent;
 
    //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
-   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().slc =
-      true;
+   bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
+      cache_sys_coherent;
 
    //~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
    //~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
@@ -799,7 +830,7 @@ BEGIN_TEST(assembler.mimg)
    //~gfx11! image_atomic_add v10, v20, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D glc ; f0304f04 00100a14
    //~gfx12! image_atomic_add_uint v10, [v20, v21, v0, v0], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d3c30001 0010800a 00001514
    bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
-            op_v1, op_v2, 0xf, false, false, false, true)
+            op_v1, op_v2, 0xf, false, false, false, cache_atomic_rtn)
      ->mimg()
      .dim = ac_image_2d;
 
@@ -876,16 +907,28 @@ BEGIN_TEST(assembler.flat)
    bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
 
    /* Various flags */
+   ac_hw_cache_flags cache_coherent;
+   ac_hw_cache_flags cache_sys_coherent;
+   ac_hw_cache_flags cache_non_temporal;
+   ac_hw_cache_flags cache_atomic_rtn;
+   cache_coherent.value = ac_glc;
+   cache_sys_coherent.value = ac_slc;
+   cache_non_temporal.value = ac_dlc;
+   cache_atomic_rtn.value = ac_glc;
+
    //~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014
    //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014
-   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().slc = true;
+   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
+      cache_sys_coherent;
 
    //~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014
    //~gfx12! (then repeated 2 times)
-   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().glc = true;
+   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
+      cache_coherent;
 
    //~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014
-   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().dlc = true;
+   bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
+      cache_non_temporal;
 
    /* Stores */
    //~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14
@@ -895,8 +938,8 @@ BEGIN_TEST(assembler.flat)
    /* Atomic with return */
    //~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc ; dcd64000 2a7c0a14
    //~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
-   bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().glc =
-      true;
+   bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
+      cache_atomic_rtn;
 
    finish_assembler_test();
 }
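
The gfx12 expectations in these tests ("scope:SCOPE_SYS", "th:TH_ATOMIC_RETURN") follow from get_gfx12_cpol() in the assembler hunk at the top of this patch: any legacy glc/slc/dlc bit promotes a non-atomic access to SCOPE_SYS, and glc on an atomic becomes TH_ATOMIC_RETURN. A worked example mirroring that mapping (not a separate implementation):

   /* For a non-atomic GFX12 load: any legacy coherence bit -> SCOPE_SYS (3),
    * otherwise SCOPE_CU (0), as in get_gfx12_cpol() above. */
   ac_hw_cache_flags cache;
   cache.value = ac_dlc; /* e.g. the dlc test cases */
   unsigned scope = (cache.value & (ac_glc | ac_slc | ac_dlc)) ? 3 /*SCOPE_SYS*/ : 0 /*SCOPE_CU*/;
   assert(scope == 3); /* printed as scope:SCOPE_SYS */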