diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 35f07dddb85..05ec485a2cf 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -428,6 +428,15 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* break; } case Format::MIMG: { + unsigned use_nsa = false; + unsigned addr_dwords = instr->operands.size() - 3; + for (unsigned i = 1; i < addr_dwords; i++) { + if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) + use_nsa = true; + } + assert(!use_nsa || ctx.chip_class >= GFX10); + unsigned nsa_dwords = use_nsa ? DIV_ROUND_UP(addr_dwords - 1, 4) : 0; + MIMG_instruction* mimg = static_cast(instr); uint32_t encoding = (0b111100 << 26); encoding |= mimg->slc ? 1 << 25 : 0; @@ -443,6 +452,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= mimg->da ? 1 << 14 : 0; } else { encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ + encoding |= nsa_dwords << 1; encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */ encoding |= mimg->dlc ? 1 << 7 : 0; } @@ -465,6 +475,13 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } out.push_back(encoding); + + if (nsa_dwords) { + out.resize(out.size() + nsa_dwords); + std::vector::iterator nsa = std::prev(out.end(), nsa_dwords); + for (unsigned i = 0; i < addr_dwords - 1; i++) + nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8); + } break; } case Format::FLAT: diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index d97604f8571..08ad9572bdb 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5841,38 +5841,52 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, - const std::vector& coords, + std::vector coords, unsigned num_wqm_coords=0, Operand vdata=Operand(v1)) { - Temp coord = coords[0]; - if (coords.size() > 1) { - coord = bld.tmp(RegType::vgpr, coords.size()); + if (bld.program->chip_class < GFX10) { + Temp coord = coords[0]; + if (coords.size() > 1) { + coord = bld.tmp(RegType::vgpr, coords.size()); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; - for (unsigned i = 0; i < coords.size(); i++) - vec->operands[i] = Operand(coords[i]); - vec->definitions[0] = Definition(coord); - bld.insert(std::move(vec)); - } else if (coord.type() == RegType::sgpr) { - coord = bld.copy(bld.def(v1), coord); - } + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + for (unsigned i = 0; i < coords.size(); i++) + vec->operands[i] = Operand(coords[i]); + vec->definitions[0] = Definition(coord); + bld.insert(std::move(vec)); + } else if (coord.type() == RegType::sgpr) { + coord = bld.copy(bld.def(v1), coord); + } - if (num_wqm_coords) { - /* We don't need the bias, sample index, compare value or offset to be - * computed in WQM but if the p_create_vector copies the coordinates, then it - * needs to be in WQM. */ - coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true); + if (num_wqm_coords) { + /* We don't need the bias, sample index, compare value or offset to be + * computed in WQM but if the p_create_vector copies the coordinates, then it + * needs to be in WQM. */ + coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true); + } + + coords[0] = coord; + coords.resize(1); + } else { + for (unsigned i = 0; i < num_wqm_coords; i++) + coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true); + + for (Temp& coord : coords) { + if (coord.type() == RegType::sgpr) + coord = bld.copy(bld.def(v1), coord); + } } aco_ptr mimg{create_instruction( - op, Format::MIMG, 4, dst.isTemp())}; + op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; if (dst.isTemp()) mimg->definitions[0] = dst; mimg->operands[0] = Operand(rsrc); mimg->operands[1] = samp; mimg->operands[2] = vdata; - mimg->operands[3] = Operand(coord); + for (unsigned i = 0; i < coords.size(); i++) + mimg->operands[3 + i] = Operand(coords[i]); MIMG_instruction *res = mimg.get(); bld.insert(std::move(mimg)); diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 3abb21b5796..4c707b49699 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -436,7 +436,7 @@ bool validate_ir(Program* program) break; } case Format::MIMG: { - check(instr->operands.size() == 4, "MIMG instructions must have 4 operands", instr.get()); + check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get()); check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8), "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get()); if (instr->operands[1].hasRegClass()) @@ -447,8 +447,15 @@ bool validate_ir(Program* program) check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap), "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get()); } - check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().type() == RegType::vgpr, - "MIMG operands[3] (VADDR) must be VGPR", instr.get()); + check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get()); + for (unsigned i = 3; i < instr->operands.size(); i++) { + if (instr->operands.size() == 4) { + check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr, + "MIMG operands[3] (VADDR) must be VGPR", instr.get()); + } else { + check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get()); + } + } check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr), "MIMG definitions[0] (VDATA) must be VGPR", instr.get()); break; diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp index c911fea839b..208833c54fc 100644 --- a/src/amd/compiler/tests/test_isel.cpp +++ b/src/amd/compiler/tests/test_isel.cpp @@ -149,18 +149,18 @@ BEGIN_TEST(isel.sparse.clause) }; void main() { //>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0 - //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation //>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0 - //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation //>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0 - //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation //>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0 - //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation //>> s_clause 0x3 - //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe - //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe - //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe - //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]); code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]); code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);