diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 0f1c3afebe0..9f2b52e9bdd 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5930,14 +5930,7 @@ static MIMG_instruction*
 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp,
           std::vector<Temp> coords, bool needs_wqm = false, Operand vdata = Operand(v1))
 {
-   /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues.
-    * On GFX11 the first 4 vaddr are single registers and the last contains the remaining
-    * vector.
-    */
-   size_t nsa_size = bld.program->gfx_level == GFX10     ? 5
-                     : bld.program->gfx_level == GFX10_3 ? 13
-                     : bld.program->gfx_level >= GFX11   ? 4
-                                                         : 0;
+   size_t nsa_size = bld.program->dev.max_nsa_vgprs;
    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
 
    for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index f4a8f6375e9..d4d76505270 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -188,6 +188,21 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
       program->dev.scratch_global_offset_max = 4095;
    }
 
+   if (program->gfx_level >= GFX11) {
+      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
+       * rest of the address.
+       */
+      program->dev.max_nsa_vgprs = 4;
+   } else if (program->gfx_level >= GFX10_3) {
+      /* GFX10.3 can have up to 3 NSA dwords. */
+      program->dev.max_nsa_vgprs = 13;
+   } else if (program->gfx_level >= GFX10) {
+      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
+      program->dev.max_nsa_vgprs = 5;
+   } else {
+      program->dev.max_nsa_vgprs = 0;
+   }
+
    program->wgp_mode = wgp_mode;
 
    program->progress = CompilationProgress::after_isel;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 809cab79728..6f7071a1437 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1619,7 +1619,8 @@ struct MIMG_instruction : public Instruction {
    bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
    bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
    bool disable_wqm : 1; /* Require an exec mask without helper invocations */
-   uint8_t padding0 : 2;
+   bool strict_wqm : 1;  /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
+   uint8_t padding0 : 1;
    uint8_t padding1;
    uint8_t padding2;
 };
@@ -2095,6 +2096,7 @@ struct DeviceInfo {
 
    int16_t scratch_global_offset_min;
    int16_t scratch_global_offset_max;
+   unsigned max_nsa_vgprs;
 };
 
 enum class CompilationProgress {
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index c048aa406b8..6da27e170d5 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2138,6 +2138,66 @@ hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scra
    }
 }
 
+void
+lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
+{
+   Operand linear_vgpr = instr->operands[3];
+
+   unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
+   unsigned vaddr_size = linear_vgpr.size();
+   unsigned num_copied_vgprs = instr->operands.size() - 4;
+   nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
+                 ? nsa_size
+                 : 0;
+
+   Operand vaddr[16];
+   unsigned num_vaddr = 0;
+
+   if (nsa_size) {
+      assert(num_copied_vgprs <= nsa_size);
+      for (unsigned i = 0; i < num_copied_vgprs; i++)
+         vaddr[num_vaddr++] = instr->operands[4 + i];
+      for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
+         vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
+      if (vaddr_size > nsa_size) {
+         RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
+         vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
+      }
+   } else {
+      PhysReg reg = linear_vgpr.physReg();
+      std::map<PhysReg, copy_operation> copy_operations;
+      for (unsigned i = 4; i < instr->operands.size(); i++) {
+         Operand arg = instr->operands[i];
+         Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
+         copy_operations[def.physReg()] = {arg, def, def.bytes()};
+         reg = reg.advance(arg.bytes());
+      }
+      vaddr[num_vaddr++] = linear_vgpr;
+
+      Pseudo_instruction pi = {};
+      handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
+   }
+
+   instr->mimg().strict_wqm = false;
+
+   if ((3 + num_vaddr) > instr->operands.size()) {
+      MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
+         instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
+      std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
+                new_instr->definitions.begin());
+      new_instr->operands[0] = instr->operands[0];
+      new_instr->operands[1] = instr->operands[1];
+      new_instr->operands[2] = instr->operands[2];
+      memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
+             sizeof(MIMG_instruction) - sizeof(Instruction));
+      instr.reset(new_instr);
+   } else {
+      while (instr->operands.size() > (3 + num_vaddr))
+         instr->operands.pop_back();
+   }
+   std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
+}
+
 void
 lower_to_hw_instr(Program* program)
 {
@@ -2802,6 +2862,9 @@ lower_to_hw_instr(Program* program)
             ctx.instructions.emplace_back(std::move(instr));
 
             emit_set_mode(bld, block->fp_mode, set_round, false);
+         } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
+            lower_image_sample(&ctx, instr);
+            ctx.instructions.emplace_back(std::move(instr));
          } else {
             ctx.instructions.emplace_back(std::move(instr));
          }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 1e8625efa26..0fb22f18e3e 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -2445,7 +2445,8 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
                op.getTemp().type() == instr->definitions[0].getTemp().type())
               ctx.vectors[op.tempId()] = instr.get();
         }
-      } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
+      } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
+                 !instr->mimg().strict_wqm) {
         for (unsigned i = 3; i < instr->operands.size(); i++)
            ctx.vectors[instr->operands[i].tempId()] = instr.get();
      } else if (instr->opcode == aco_opcode::p_split_vector &&
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 7a21f90b00f..5e2ff297405 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -696,21 +696,36 @@ validate_ir(Program* program)
                  "TFE/LWE loads", instr.get());
          }
-         check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
-               "NSA is only supported on GFX10+", instr.get());
-         for (unsigned i = 3; i < instr->operands.size(); i++) {
-            check(instr->operands[i].hasRegClass() &&
-                     instr->operands[i].regClass().type() == RegType::vgpr,
-                  "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
-            if (instr->operands.size() > 4) {
-               if (program->gfx_level < GFX11) {
-                  check(instr->operands[i].regClass() == v1,
-                        "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
-               } else {
-                  if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
-                      instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
-                     check(instr->operands[i].regClass() == v1,
-                           "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
-                  }
-               }
-            }
-         }
+
+         if (instr->mimg().strict_wqm) {
+            check(instr->operands[3].isTemp() && instr->operands[3].regClass().is_linear_vgpr(),
+                  "MIMG operands[3] must be temp linear VGPR.", instr.get());
+
+            unsigned total_size = 0;
+            for (unsigned i = 4; i < instr->operands.size(); i++) {
+               check(instr->operands[i].isTemp() && instr->operands[i].regClass() == v1,
+                     "MIMG operands[4+] (VADDR) must be v1", instr.get());
+               total_size += instr->operands[i].bytes();
+            }
+            check(total_size <= instr->operands[3].bytes(),
+                  "MIMG operands[4+] must fit within operands[3].", instr.get());
+         } else {
+            check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
+                  "NSA is only supported on GFX10+", instr.get());
+            for (unsigned i = 3; i < instr->operands.size(); i++) {
+               check(instr->operands[i].hasRegClass() &&
+                        instr->operands[i].regClass().type() == RegType::vgpr,
+                     "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
+               if (instr->operands.size() > 4) {
+                  if (program->gfx_level < GFX11) {
+                     check(instr->operands[i].regClass() == v1,
+                           "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
+                  } else {
+                     if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
+                         instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
+                        check(instr->operands[i].regClass() == v1,
+                              "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
+                     }
+                  }
+               }
+            }
+         }