diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 48a2b55301f..994ed2860e1 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "aco_ir.h" #include "aco_builder.h" @@ -10,11 +11,16 @@ namespace aco { +struct constaddr_info { + unsigned getpc_end; + unsigned add_literal; +}; + struct asm_context { Program *program; enum chip_class chip_class; std::vector> branches; - std::vector constaddrs; + std::map constaddrs; const int16_t* opcode; // TODO: keep track of branch instructions referring blocks // and, when emitting the block, correct the offset in instr @@ -45,39 +51,17 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) { /* lower remaining pseudo-instructions */ - if (instr->opcode == aco_opcode::p_constaddr) { - unsigned dest = instr->definitions[0].physReg(); - unsigned offset = instr->operands[0].constantValue(); + if (instr->opcode == aco_opcode::p_constaddr_getpc) { + ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1; - /* s_getpc_b64 dest[0:1] */ - uint32_t encoding = (0b101111101 << 23); - uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64]; - if (opcode >= 55 && ctx.chip_class <= GFX9) { - assert(ctx.chip_class == GFX9 && opcode < 60); - opcode = opcode - 4; - } - encoding |= dest << 16; - encoding |= opcode << 8; - out.push_back(encoding); + instr->opcode = aco_opcode::s_getpc_b64; + instr->operands.pop_back(); + } else if (instr->opcode == aco_opcode::p_constaddr_addlo) { + ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1; - /* s_add_u32 dest[0], dest[0], ... */ - encoding = (0b10 << 30); - encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23; - encoding |= dest << 16; - encoding |= dest; - encoding |= 255 << 8; - out.push_back(encoding); - ctx.constaddrs.push_back(out.size()); - out.push_back(offset); - - /* s_addc_u32 dest[1], dest[1], 0 */ - encoding = (0b10 << 30); - encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23; - encoding |= (dest + 1) << 16; - encoding |= dest + 1; - encoding |= 128 << 8; - out.push_back(encoding); - return; + instr->opcode = aco_opcode::s_add_u32; + instr->operands[1] = Operand(0u); + instr->operands[1].setFixed(PhysReg(255)); } uint32_t opcode = ctx.opcode[(int)instr->opcode]; @@ -798,14 +782,14 @@ static void insert_code(asm_context& ctx, std::vector& out, unsigned i for (; branch_it != ctx.branches.end(); ++branch_it) branch_it->first += insert_count; - /* Find first constant address after the inserted code */ - auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [insert_before](const int &caddr_pos) -> bool { - return (unsigned)caddr_pos >= insert_before; - }); - - /* Update the locations of constant addresses */ - for (; caddr_it != ctx.constaddrs.end(); ++caddr_it) - (*caddr_it) += insert_count; + /* Update the locations of p_constaddr instructions */ + for (auto& constaddr : ctx.constaddrs) { + constaddr_info& info = constaddr.second; + if (info.getpc_end >= insert_before) + info.getpc_end += insert_count; + if (info.add_literal >= insert_before) + info.add_literal += insert_count; + } } static void fix_branches_gfx10(asm_context& ctx, std::vector& out) @@ -928,8 +912,10 @@ void fix_branches(asm_context& ctx, std::vector& out) void fix_constaddrs(asm_context& ctx, std::vector& out) { - for (unsigned addr : ctx.constaddrs) - out[addr] += (out.size() - addr + 1u) * 4u; + for (auto& constaddr : ctx.constaddrs) { + constaddr_info& info = constaddr.second; + out[info.add_literal] += (out.size() - info.getpc_end) * 4u; + } } unsigned emit_program(Program* program, diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index eda4e9d4340..888359ff5a3 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5550,7 +5550,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), + bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), Operand(MIN2(base + range, ctx->shader->constant_data_size)), Operand(desc_type)); unsigned size = instr->dest.ssa.bit_size / 8; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index a2129728d16..42139bd0c01 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1979,6 +1979,17 @@ void lower_to_hw_instr(Program* program) unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); break; } + case aco_opcode::p_constaddr: + { + unsigned id = instr->definitions[0].tempId(); + PhysReg reg = instr->definitions[0].physReg(); + bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand(id)); + bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc), + Operand(reg, s1), Operand(id)); + bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc), + Operand(reg.advance(4), s1), Operand(0u), Operand(scc, s1)); + break; + } default: break; } diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index a2ae4f17308..9138fe2bac9 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -288,6 +288,8 @@ opcode("p_exit_early_if") # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64 opcode("p_bpermute") +opcode("p_constaddr") + # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { # GFX6, GFX7, GFX8, GFX9, GFX10, name @@ -344,6 +346,8 @@ SOP2 = { ( -1, -1, -1, 0x34, 0x34, "s_pack_hh_b32_b16"), ( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_u32"), ( -1, -1, -1, 0x2d, 0x36, "s_mul_hi_i32"), + # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2. + ( -1, -1, -1, -1, -1, "p_constaddr_addlo"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2: opcode(name, gfx7, gfx9, gfx10, Format.SOP2) @@ -457,7 +461,7 @@ SOP1 = { ( -1, -1, -1, -1, 0x47, "s_andn2_wrexec_b32"), ( -1, -1, -1, -1, 0x49, "s_movrelsd_2_b32"), # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1. - ( -1, -1, -1, -1, -1, "p_constaddr"), + ( -1, -1, -1, -1, -1, "p_constaddr_getpc"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1: opcode(name, gfx7, gfx9, gfx10, Format.SOP1) diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index bd6055cc20a..47698699d1d 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -220,7 +220,9 @@ BEGIN_TEST(assembler.long_jump.constaddr) //>> s_getpc_b64 s[0:1] ; be801f00 //! s_add_u32 s0, s0, 0xe0 ; 8000ff00 000000e0 - bld.sop1(aco_opcode::p_constaddr, Definition(PhysReg(0), s2), Operand(0u)); + bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand(0u)); + bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc), + Operand(PhysReg(0), s1), Operand(0u)); program->blocks[2].linear_preds.push_back(0u); program->blocks[2].linear_preds.push_back(1u);