aco: lower p_constaddr into separate instructions earlier

This allows them to be scheduled properly and simplifies the assembler a
little.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8994>
This commit is contained in:
Rhys Perry 2021-02-01 12:42:38 +00:00
parent ab957bb899
commit 0af7ff49fd
5 changed files with 48 additions and 45 deletions

View file

@ -1,5 +1,6 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <map>
#include "aco_ir.h" #include "aco_ir.h"
#include "aco_builder.h" #include "aco_builder.h"
@ -10,11 +11,16 @@
namespace aco { namespace aco {
/* Positions recorded while assembling one lowered p_constaddr, stored in
 * asm_context::constaddrs under the constant operand shared by its
 * p_constaddr_getpc / p_constaddr_addlo pair. Both fields are indices into
 * the output dword vector; insert_code() shifts them when code is inserted
 * and fix_constaddrs() uses them to patch the literal. */
struct constaddr_info {
/* out.size() + 1 taken when p_constaddr_getpc is emitted, i.e. the index just
 * past the dword that instruction is about to occupy (presumably a single
 * dword s_getpc_b64 -- confirm against the SOP1 encoder). */
unsigned getpc_end;
/* out.size() + 1 taken when p_constaddr_addlo is emitted; fix_constaddrs()
 * adds (out.size() - getpc_end) * 4 to the dword at this index, so it is
 * expected to be the literal that follows the s_add_u32 encoding dword. */
unsigned add_literal;
};
struct asm_context { struct asm_context {
Program *program; Program *program;
enum chip_class chip_class; enum chip_class chip_class;
std::vector<std::pair<int, SOPP_instruction*>> branches; std::vector<std::pair<int, SOPP_instruction*>> branches;
std::vector<unsigned> constaddrs; std::map<unsigned, constaddr_info> constaddrs;
const int16_t* opcode; const int16_t* opcode;
// TODO: keep track of branch instructions referring blocks // TODO: keep track of branch instructions referring blocks
// and, when emitting the block, correct the offset in instr // and, when emitting the block, correct the offset in instr
@ -45,39 +51,17 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr) void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
{ {
/* lower remaining pseudo-instructions */ /* lower remaining pseudo-instructions */
if (instr->opcode == aco_opcode::p_constaddr) { if (instr->opcode == aco_opcode::p_constaddr_getpc) {
unsigned dest = instr->definitions[0].physReg(); ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;
unsigned offset = instr->operands[0].constantValue();
/* s_getpc_b64 dest[0:1] */ instr->opcode = aco_opcode::s_getpc_b64;
uint32_t encoding = (0b101111101 << 23); instr->operands.pop_back();
uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64]; } else if (instr->opcode == aco_opcode::p_constaddr_addlo) {
if (opcode >= 55 && ctx.chip_class <= GFX9) { ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;
assert(ctx.chip_class == GFX9 && opcode < 60);
opcode = opcode - 4;
}
encoding |= dest << 16;
encoding |= opcode << 8;
out.push_back(encoding);
/* s_add_u32 dest[0], dest[0], ... */ instr->opcode = aco_opcode::s_add_u32;
encoding = (0b10 << 30); instr->operands[1] = Operand(0u);
encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23; instr->operands[1].setFixed(PhysReg(255));
encoding |= dest << 16;
encoding |= dest;
encoding |= 255 << 8;
out.push_back(encoding);
ctx.constaddrs.push_back(out.size());
out.push_back(offset);
/* s_addc_u32 dest[1], dest[1], 0 */
encoding = (0b10 << 30);
encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23;
encoding |= (dest + 1) << 16;
encoding |= dest + 1;
encoding |= 128 << 8;
out.push_back(encoding);
return;
} }
uint32_t opcode = ctx.opcode[(int)instr->opcode]; uint32_t opcode = ctx.opcode[(int)instr->opcode];
@ -798,14 +782,14 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
for (; branch_it != ctx.branches.end(); ++branch_it) for (; branch_it != ctx.branches.end(); ++branch_it)
branch_it->first += insert_count; branch_it->first += insert_count;
/* Find first constant address after the inserted code */ /* Update the locations of p_constaddr instructions */
auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [insert_before](const int &caddr_pos) -> bool { for (auto& constaddr : ctx.constaddrs) {
return (unsigned)caddr_pos >= insert_before; constaddr_info& info = constaddr.second;
}); if (info.getpc_end >= insert_before)
info.getpc_end += insert_count;
/* Update the locations of constant addresses */ if (info.add_literal >= insert_before)
for (; caddr_it != ctx.constaddrs.end(); ++caddr_it) info.add_literal += insert_count;
(*caddr_it) += insert_count; }
} }
static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out) static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
@ -928,8 +912,10 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out) void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
{ {
for (unsigned addr : ctx.constaddrs) for (auto& constaddr : ctx.constaddrs) {
out[addr] += (out.size() - addr + 1u) * 4u; constaddr_info& info = constaddr.second;
out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
}
} }
unsigned emit_program(Program* program, unsigned emit_program(Program* program,

View file

@ -5550,7 +5550,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
offset = bld.vadd32(bld.def(v1), Operand(base), offset); offset = bld.vadd32(bld.def(v1), Operand(base), offset);
Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
Operand(MIN2(base + range, ctx->shader->constant_data_size)), Operand(MIN2(base + range, ctx->shader->constant_data_size)),
Operand(desc_type)); Operand(desc_type));
unsigned size = instr->dest.ssa.bit_size / 8; unsigned size = instr->dest.ssa.bit_size / 8;

View file

@ -1979,6 +1979,17 @@ void lower_to_hw_instr(Program* program)
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
break; break;
} }
case aco_opcode::p_constaddr:
{
/* Split p_constaddr into three separate instructions here instead of in the
 * assembler. The temp id of the definition is passed as a constant operand
 * to both pseudo-instructions; the assembler uses that constant as the key
 * under which it records where the pair ended up in the binary. */
unsigned id = instr->definitions[0].tempId();
PhysReg reg = instr->definitions[0].physReg();
/* Writes the whole original definition; the assembler rewrites this to s_getpc_b64. */
bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand(id));
/* Low dword: reg = reg + <placeholder>, with scc as a second definition.
 * The assembler rewrites this to s_add_u32 with a literal second operand. */
bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
Operand(reg, s1), Operand(id));
/* High dword (reg.advance(4)): add 0 plus the scc operand, propagating the
 * carry from the low-dword addition above. */
bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),
Operand(reg.advance(4), s1), Operand(0u), Operand(scc, s1));
break;
}
default: default:
break; break;
} }

View file

@ -288,6 +288,8 @@ opcode("p_exit_early_if")
# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64 # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute") opcode("p_bpermute")
opcode("p_constaddr")
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
SOP2 = { SOP2 = {
# GFX6, GFX7, GFX8, GFX9, GFX10, name # GFX6, GFX7, GFX8, GFX9, GFX10, name
@ -344,6 +346,8 @@ SOP2 = {
( -1, -1, -1, 0x34, 0x34, "s_pack_hh_b32_b16"), ( -1, -1, -1, 0x34, 0x34, "s_pack_hh_b32_b16"),
( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_u32"), ( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_u32"),
( -1, -1, -1, 0x2d, 0x36, "s_mul_hi_i32"), ( -1, -1, -1, 0x2d, 0x36, "s_mul_hi_i32"),
# actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2.
( -1, -1, -1, -1, -1, "p_constaddr_addlo"),
} }
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2: for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2:
opcode(name, gfx7, gfx9, gfx10, Format.SOP2) opcode(name, gfx7, gfx9, gfx10, Format.SOP2)
@ -457,7 +461,7 @@ SOP1 = {
( -1, -1, -1, -1, 0x47, "s_andn2_wrexec_b32"), ( -1, -1, -1, -1, 0x47, "s_andn2_wrexec_b32"),
( -1, -1, -1, -1, 0x49, "s_movrelsd_2_b32"), ( -1, -1, -1, -1, 0x49, "s_movrelsd_2_b32"),
# actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1. # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
( -1, -1, -1, -1, -1, "p_constaddr"), ( -1, -1, -1, -1, -1, "p_constaddr_getpc"),
} }
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1: for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1:
opcode(name, gfx7, gfx9, gfx10, Format.SOP1) opcode(name, gfx7, gfx9, gfx10, Format.SOP1)

View file

@ -220,7 +220,9 @@ BEGIN_TEST(assembler.long_jump.constaddr)
//>> s_getpc_b64 s[0:1] ; be801f00 //>> s_getpc_b64 s[0:1] ; be801f00
//! s_add_u32 s0, s0, 0xe0 ; 8000ff00 000000e0 //! s_add_u32 s0, s0, 0xe0 ; 8000ff00 000000e0
bld.sop1(aco_opcode::p_constaddr, Definition(PhysReg(0), s2), Operand(0u)); bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand(0u));
bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
Operand(PhysReg(0), s1), Operand(0u));
program->blocks[2].linear_preds.push_back(0u); program->blocks[2].linear_preds.push_back(0u);
program->blocks[2].linear_preds.push_back(1u); program->blocks[2].linear_preds.push_back(1u);