aco: add MIMG_instruction::strict_wqm

This lets us use linear VGPRs for part of the texture sample's address.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22636>
This commit is contained in:
Rhys Perry 2023-03-21 14:44:09 +00:00 committed by Marge Bot
parent 1a6a57ac96
commit 35c133a77b
6 changed files with 113 additions and 24 deletions

View file

@ -5930,14 +5930,7 @@ static MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
bool needs_wqm = false, Operand vdata = Operand(v1))
{
/* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues.
* On GFX11 the first 4 vaddr are single registers and the last contains the remaining
* vector.
*/
size_t nsa_size = bld.program->gfx_level == GFX10 ? 5
: bld.program->gfx_level == GFX10_3 ? 13
: bld.program->gfx_level >= GFX11 ? 4
: 0;
size_t nsa_size = bld.program->dev.max_nsa_vgprs;
nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {

View file

@ -188,6 +188,21 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
program->dev.scratch_global_offset_max = 4095;
}
if (program->gfx_level >= GFX11) {
/* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
* rest of the address.
*/
program->dev.max_nsa_vgprs = 4;
} else if (program->gfx_level >= GFX10_3) {
/* GFX10.3 can have up to 3 NSA dwords. */
program->dev.max_nsa_vgprs = 13;
} else if (program->gfx_level >= GFX10) {
/* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
program->dev.max_nsa_vgprs = 5;
} else {
program->dev.max_nsa_vgprs = 0;
}
program->wgp_mode = wgp_mode;
program->progress = CompilationProgress::after_isel;

View file

@ -1619,7 +1619,8 @@ struct MIMG_instruction : public Instruction {
bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
uint8_t padding0 : 2;
bool strict_wqm : 1; /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
uint8_t padding0 : 1;
uint8_t padding1;
uint8_t padding2;
};
@ -2095,6 +2096,7 @@ struct DeviceInfo {
int16_t scratch_global_offset_min;
int16_t scratch_global_offset_max;
unsigned max_nsa_vgprs;
};
enum class CompilationProgress {

View file

@ -2138,6 +2138,66 @@ hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scra
}
}
/* Lowers a MIMG instruction that has strict_wqm set.
 *
 * On entry, operands[3] is the linear VGPR that backs the address and operands[4+] are
 * the VGPRs that belong at the start of it. Two outcomes are possible:
 *  - NSA is usable: the extra VGPRs are referenced directly as separate address
 *    operands, followed by the untouched remainder of the linear VGPR.
 *  - NSA is not usable: a set of copies from the extra VGPRs into the linear VGPR is
 *    handed to handle_operands(), and the linear VGPR becomes the only address operand.
 *
 * In both cases operands[3+] are rewritten in place (the instruction is re-created if it
 * needs more operands than it currently has) and strict_wqm is cleared.
 */
void
lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
{
   Operand linear_vgpr = instr->operands[3];

   unsigned vaddr_size = linear_vgpr.size();
   unsigned num_copied_vgprs = instr->operands.size() - 4;

   /* NSA is only worth using if there is something to copy, and (before GFX11) only if
    * the whole address fits into the NSA registers.
    */
   unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
   if (num_copied_vgprs == 0)
      nsa_size = 0;
   else if (ctx->program->gfx_level < GFX11 && vaddr_size > nsa_size)
      nsa_size = 0;

   Operand vaddr[16];
   unsigned num_vaddr = 0;

   if (nsa_size != 0) {
      assert(num_copied_vgprs <= nsa_size);

      /* The extra VGPRs are used where they are, no copies are needed. */
      while (num_vaddr < num_copied_vgprs) {
         vaddr[num_vaddr] = instr->operands[4 + num_vaddr];
         num_vaddr++;
      }

      /* Remaining single-dword slots come straight out of the linear VGPR. */
      unsigned num_single = std::min(vaddr_size, nsa_size);
      for (; num_vaddr < num_single; num_vaddr++)
         vaddr[num_vaddr] = Operand(linear_vgpr.physReg().advance(num_vaddr * 4), v1);

      /* Whatever does not fit into the single-dword slots is passed as one vector. */
      if (vaddr_size > nsa_size) {
         unsigned tail_dwords = vaddr_size - nsa_size;
         RegClass tail_rc = RegClass::get(RegType::vgpr, tail_dwords * 4);
         PhysReg tail_reg = PhysReg(linear_vgpr.physReg().advance(nsa_size * 4));
         vaddr[num_vaddr++] = Operand(tail_reg, tail_rc);
      }
   } else {
      /* Pack the extra VGPRs back-to-back at the start of the linear VGPR. */
      std::map<PhysReg, copy_operation> copy_operations;
      PhysReg dst_reg = linear_vgpr.physReg();
      for (unsigned idx = 4; idx < instr->operands.size(); idx++) {
         Operand src = instr->operands[idx];
         Definition dst(dst_reg, RegClass::get(RegType::vgpr, src.bytes()));
         copy_operations[dst.physReg()] = {src, dst, dst.bytes()};
         dst_reg = dst_reg.advance(src.bytes());
      }

      vaddr[num_vaddr++] = linear_vgpr;

      Pseudo_instruction pi = {};
      handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
   }

   /* Must be cleared before the fields are copied into a replacement instruction. */
   instr->mimg().strict_wqm = false;

   unsigned num_operands = 3 + num_vaddr;
   if (num_operands > instr->operands.size()) {
      /* Not enough operand slots: build a bigger instruction and carry everything over. */
      MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
         instr->opcode, Format::MIMG, num_operands, instr->definitions.size());

      for (unsigned idx = 0; idx < 3; idx++)
         new_instr->operands[idx] = instr->operands[idx];
      std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
                new_instr->definitions.begin());

      /* Copy the MIMG-specific fields that follow the Instruction base. */
      uint8_t* dst_fields = (uint8_t*)new_instr + sizeof(Instruction);
      uint8_t* src_fields = (uint8_t*)instr.get() + sizeof(Instruction);
      memcpy(dst_fields, src_fields, sizeof(MIMG_instruction) - sizeof(Instruction));

      instr.reset(new_instr);
   } else {
      /* Too many operand slots: drop the surplus ones from the end. */
      while (instr->operands.size() > num_operands)
         instr->operands.pop_back();
   }

   std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
}
void
lower_to_hw_instr(Program* program)
{
@ -2802,6 +2862,9 @@ lower_to_hw_instr(Program* program)
ctx.instructions.emplace_back(std::move(instr));
emit_set_mode(bld, block->fp_mode, set_round, false);
} else if (instr->isMIMG() && instr->mimg().strict_wqm) {
lower_image_sample(&ctx, instr);
ctx.instructions.emplace_back(std::move(instr));
} else {
ctx.instructions.emplace_back(std::move(instr));
}

View file

@ -2445,7 +2445,8 @@ get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
op.getTemp().type() == instr->definitions[0].getTemp().type())
ctx.vectors[op.tempId()] = instr.get();
}
} else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
} else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
!instr->mimg().strict_wqm) {
for (unsigned i = 3; i < instr->operands.size(); i++)
ctx.vectors[instr->operands[i].tempId()] = instr.get();
} else if (instr->opcode == aco_opcode::p_split_vector &&

View file

@ -696,21 +696,36 @@ validate_ir(Program* program)
"TFE/LWE loads",
instr.get());
}
check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
"NSA is only supported on GFX10+", instr.get());
for (unsigned i = 3; i < instr->operands.size(); i++) {
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"MIMG operands[3+] (VADDR) must be VGPR", instr.get());
if (instr->operands.size() > 4) {
if (program->gfx_level < GFX11) {
check(instr->operands[i].regClass() == v1,
"GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
} else {
if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
if (instr->mimg().strict_wqm) {
check(instr->operands[3].isTemp() && instr->operands[3].regClass().is_linear_vgpr(),
"MIMG operands[3] must be temp linear VGPR.", instr.get());
unsigned total_size = 0;
for (unsigned i = 4; i < instr->operands.size(); i++) {
check(instr->operands[i].isTemp() && instr->operands[i].regClass() == v1,
"MIMG operands[4+] (VADDR) must be v1", instr.get());
total_size += instr->operands[i].bytes();
}
check(total_size <= instr->operands[3].bytes(),
"MIMG operands[4+] must fit within operands[3].", instr.get());
} else {
check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
"NSA is only supported on GFX10+", instr.get());
for (unsigned i = 3; i < instr->operands.size(); i++) {
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"MIMG operands[3+] (VADDR) must be VGPR", instr.get());
if (instr->operands.size() > 4) {
if (program->gfx_level < GFX11) {
check(instr->operands[i].regClass() == v1,
"first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
"GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
} else {
if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
check(instr->operands[i].regClass() == v1,
"first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
}
}
}
}