diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 1b081b3d7c1..8631189b7bf 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -41,14 +41,15 @@ struct constaddr_info { }; struct asm_context { - Program *program; + Program* program; enum chip_class chip_class; std::vector> branches; std::map constaddrs; const int16_t* opcode; // TODO: keep track of branch instructions referring blocks // and, when emitting the block, correct the offset in instr - asm_context(Program* program_) : program(program_), chip_class(program->chip_class) { + asm_context(Program* program_) : program(program_), chip_class(program->chip_class) + { if (chip_class <= GFX7) opcode = &instr_info.opcode_gfx7[0]; else if (chip_class <= GFX9) @@ -60,7 +61,8 @@ struct asm_context { int subvector_begin_pos = -1; }; -static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) +static uint32_t +get_sdwa_sel(unsigned sel, PhysReg reg) { if (sel & sdwa_isra) { unsigned size = sdwa_rasize & sel; @@ -72,7 +74,9 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg) return sel & sdwa_asuint; } -unsigned get_mimg_nsa_dwords(const Instruction *instr) { +unsigned +get_mimg_nsa_dwords(const Instruction* instr) +{ unsigned addr_dwords = instr->operands.size() - 3; for (unsigned i = 1; i < addr_dwords; i++) { if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4)) @@ -81,7 +85,8 @@ unsigned get_mimg_nsa_dwords(const Instruction *instr) { return 0; } -void emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) +void +emit_instruction(asm_context& ctx, std::vector& out, Instruction* instr) { /* lower remaining pseudo-instructions */ if (instr->opcode == aco_opcode::p_constaddr_getpc) { @@ -99,11 +104,11 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* uint32_t opcode = ctx.opcode[(int)instr->opcode]; if (opcode == (uint32_t)-1) { - char *outmem; + char* outmem; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &outmem, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "Unsupported opcode: "); aco_print_instr(instr, memf); @@ -144,11 +149,11 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* uint32_t encoding = (0b1011 << 28); encoding |= opcode << 23; - encoding |= - !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? - instr->definitions[0].physReg() << 16 : - !instr->operands.empty() && instr->operands[0].physReg() <= 127 ? - instr->operands[0].physReg() << 16 : 0; + encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) + ? instr->definitions[0].physReg() << 16 + : !instr->operands.empty() && instr->operands[0].physReg() <= 127 + ? 
instr->operands[0].physReg() << 16 + : 0; encoding |= sopk.imm; out.push_back(encoding); break; @@ -177,7 +182,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* SOPP_instruction& sopp = instr->sopp(); uint32_t encoding = (0b101111111 << 23); encoding |= opcode << 16; - encoding |= (uint16_t) sopp.imm; + encoding |= (uint16_t)sopp.imm; if (sopp.block != -1) { sopp.pass_flags = 0; ctx.branches.emplace_back(out.size(), &sopp); @@ -208,7 +213,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } out.push_back(encoding); /* SMRD instructions can take a literal on GFX7 */ - if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024) + if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && + instr->operands[1].constantValue() >= 1024) out.push_back(instr->operands[1].constantValue() >> 2); return; } @@ -235,7 +241,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } if (is_load || instr->operands.size() >= 3) { /* SDATA */ - encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6; + encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) + << 6; } if (instr->operands.size() >= 1) { /* SBASE */ encoding |= instr->operands[0].physReg() >> 1; @@ -246,14 +253,16 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* int32_t offset = 0; uint32_t soffset = ctx.chip_class >= GFX10 - ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ - : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */ + ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */ + : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on + GFX8 and below) */ if (instr->operands.size() >= 2) { - const Operand &op_off1 = instr->operands[1]; + const Operand& op_off1 = instr->operands[1]; if (ctx.chip_class <= GFX9) { offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg(); } else { - /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */ + /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an + * SGPR */ if (op_off1.isConstant()) { offset = op_off1.constantValue(); } else { @@ -263,8 +272,9 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } if (soe) { - const Operand &op_off2 = instr->operands.back(); - assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */ + const Operand& op_off2 = instr->operands.back(); + assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant + and an SGPR at the same time */ assert(!op_off2.isConstant()); soffset = op_off2.physReg(); } @@ -368,9 +378,13 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding = 0; unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; encoding |= (0xFF & reg) << 24; - reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0; + reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) + ? instr->operands[2].physReg() + : 0; encoding |= (0xFF & reg) << 16; - reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0; + reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) + ? 
instr->operands[1].physReg() + : 0; encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[0].physReg()); out.push_back(encoding); @@ -402,7 +416,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= instr->operands[2].physReg() << 24; encoding |= (mubuf.tfe ? 1 : 0) << 23; encoding |= (instr->operands[0].physReg() >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() + : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[1].physReg()); out.push_back(encoding); @@ -435,7 +450,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= (mtbuf.tfe ? 1 : 0) << 23; encoding |= (mtbuf.slc ? 1 : 0) << 22; encoding |= (instr->operands[0].physReg() >> 2) << 16; - unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() + : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; encoding |= (0xFF & instr->operands[1].physReg()); @@ -465,7 +481,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= mimg.a16 ? 1 << 15 : 0; encoding |= mimg.da ? 1 << 14 : 0; } else { - encoding |= mimg.r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ + encoding |= mimg.r128 ? 1 << 15 + : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */ encoding |= nsa_dwords << 1; encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */ encoding |= mimg.dlc ? 1 << 7 : 0; @@ -485,7 +502,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* assert(!mimg.d16 || ctx.chip_class >= GFX9); encoding |= mimg.d16 ? 1 << 31 : 0; if (ctx.chip_class >= GFX10) { - encoding |= mimg.a16 ? 1 << 30 : 0; /* GFX10: A16 still exists, but is in a different place */ + /* GFX10: A16 still exists, but is in a different place */ + encoding |= mimg.a16 ? 1 << 30 : 0; } out.push_back(encoding); @@ -539,7 +557,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F); assert(instr->format != Format::FLAT); encoding |= instr->operands[1].physReg() << 16; - } else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ + } else if (instr->format != Format::FLAT || + ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */ if (ctx.chip_class <= GFX9) encoding |= 0x7F << 16; else @@ -611,7 +630,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= vop3.opsel << 11; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.abs[i] << (8+i); + encoding |= vop3.abs[i] << (8 + i); if (instr->definitions.size() == 2) encoding |= instr->definitions[1].physReg() << 8; encoding |= (0xFF & instr->definitions[0].physReg()); @@ -625,7 +644,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= vop3.omod << 27; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg[i] << (29+i); + encoding |= vop3.neg[i] << (29 + i); out.push_back(encoding); } else if (instr->isVOP3P()) { @@ -645,7 +664,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= vop3.opsel_lo << 11; encoding |= ((vop3.opsel_hi & 0x4) ? 
1 : 0) << 14; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg_hi[i] << (8+i); + encoding |= vop3.neg_hi[i] << (8 + i); encoding |= (0xFF & instr->definitions[0].physReg()); out.push_back(encoding); encoding = 0; @@ -653,17 +672,17 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= instr->operands[i].physReg() << (i * 9); encoding |= (vop3.opsel_hi & 0x3) << 27; for (unsigned i = 0; i < 3; i++) - encoding |= vop3.neg_lo[i] << (29+i); + encoding |= vop3.neg_lo[i] << (29 + i); out.push_back(encoding); - } else if (instr->isDPP()){ + } else if (instr->isDPP()) { assert(ctx.chip_class >= GFX8); DPP_instruction& dpp = instr->dpp(); /* first emit the instruction without the DPP operand */ Operand dpp_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{250}, v1); - instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP); emit_instruction(ctx, out, instr); uint32_t encoding = (0xF & dpp.row_mask) << 28; encoding |= (0xF & dpp.bank_mask) << 24; @@ -684,7 +703,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* /* first emit the instruction without the SDWA operand */ Operand sdwa_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{249}, v1); - instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA); emit_instruction(ctx, out, instr); uint32_t encoding = 0; @@ -737,7 +756,8 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } } -void emit_block(asm_context& ctx, std::vector& out, Block& block) +void +emit_block(asm_context& ctx, std::vector& out, Block& block) { for (aco_ptr& instr : block.instructions) { #if 0 @@ -754,15 +774,15 @@ void emit_block(asm_context& ctx, std::vector& out, Block& block) } } -void fix_exports(asm_context& ctx, std::vector& out, Program* program) +void +fix_exports(asm_context& ctx, std::vector& out, Program* program) { bool exported = false; for (Block& block : program->blocks) { if (!(block.kind & block_kind_export_end)) continue; std::vector>::reverse_iterator it = block.instructions.rbegin(); - while ( it != block.instructions.rend()) - { + while (it != block.instructions.rend()) { if ((*it)->isEXP()) { Export_instruction& exp = (*it)->exp(); if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) { @@ -785,15 +805,18 @@ void fix_exports(asm_context& ctx, std::vector& out, Program* program) if (!exported) { /* Abort in order to avoid a GPU hang. */ - bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); - aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment"); + bool is_vertex_or_ngg = + (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG); + aco_err(program, + "Missing export in %s shader:", is_vertex_or_ngg ? 
"vertex or NGG" : "fragment"); aco_print_program(program, stderr); abort(); } } -static void insert_code(asm_context& ctx, std::vector& out, unsigned insert_before, - unsigned insert_count, const uint32_t *insert_data) +static void +insert_code(asm_context& ctx, std::vector& out, unsigned insert_before, + unsigned insert_count, const uint32_t* insert_data) { out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count); @@ -804,9 +827,9 @@ static void insert_code(asm_context& ctx, std::vector& out, unsigned i } /* Find first branch after the inserted code */ - auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool { - return (unsigned)branch.first >= insert_before; - }); + auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), + [insert_before](const auto& branch) -> bool + { return (unsigned)branch.first >= insert_before; }); /* Update the locations of branches */ for (; branch_it != ctx.branches.end(); ++branch_it) @@ -822,15 +845,21 @@ static void insert_code(asm_context& ctx, std::vector& out, unsigned i } } -static void fix_branches_gfx10(asm_context& ctx, std::vector& out) +static void +fix_branches_gfx10(asm_context& ctx, std::vector& out) { - /* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */ + /* Branches with an offset of 0x3f are buggy on GFX10, + * we workaround by inserting NOPs if needed. + */ bool gfx10_3f_bug = false; do { - auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool { - return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f; - }); + auto buggy_branch_it = std::find_if( + ctx.branches.begin(), ctx.branches.end(), + [&ctx](const auto& branch) -> bool { + return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == + 0x3f; + }); gfx10_3f_bug = buggy_branch_it != ctx.branches.end(); @@ -842,7 +871,9 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector& out) } while (gfx10_3f_bug); } -void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector& out) +void +emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards, + std::vector& out) { Builder bld(ctx.program); @@ -857,26 +888,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, /* for conditional branches, skip the long jump if the condition is false */ aco_opcode inv; switch (branch->opcode) { - case aco_opcode::s_cbranch_scc0: - inv = aco_opcode::s_cbranch_scc1; - break; - case aco_opcode::s_cbranch_scc1: - inv = aco_opcode::s_cbranch_scc0; - break; - case aco_opcode::s_cbranch_vccz: - inv = aco_opcode::s_cbranch_vccnz; - break; - case aco_opcode::s_cbranch_vccnz: - inv = aco_opcode::s_cbranch_vccz; - break; - case aco_opcode::s_cbranch_execz: - inv = aco_opcode::s_cbranch_execnz; - break; - case aco_opcode::s_cbranch_execnz: - inv = aco_opcode::s_cbranch_execz; - break; - default: - unreachable("Unhandled long jump."); + case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break; + case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break; + case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break; + case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break; + case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break; + case aco_opcode::s_cbranch_execnz: inv = 
aco_opcode::s_cbranch_execz; break; + default: unreachable("Unhandled long jump."); } instr.reset(bld.sopp(inv, -1, 7)); emit_instruction(ctx, out, instr.get()); @@ -891,7 +909,9 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, emit_instruction(ctx, out, instr.get()); branch->pass_flags = out.size(); - instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr); + instr.reset( + bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)) + .instr); emit_instruction(ctx, out, instr.get()); /* restore SCC and clear the LSB of the new PC */ @@ -901,11 +921,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, emit_instruction(ctx, out, instr.get()); /* create the s_setpc_b64 to jump */ - instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); + instr.reset( + bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr); emit_instruction(ctx, out, instr.get()); } -void fix_branches(asm_context& ctx, std::vector& out) +void +fix_branches(asm_context& ctx, std::vector& out) { bool repeat = false; do { @@ -914,11 +936,12 @@ void fix_branches(asm_context& ctx, std::vector& out) if (ctx.chip_class == GFX10) fix_branches_gfx10(ctx, out); - for (std::pair &branch : ctx.branches) { + for (std::pair& branch : ctx.branches) { int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) { std::vector long_jump; - bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; + bool backwards = + ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first; emit_long_jump(ctx, branch.second, backwards, long_jump); out[branch.first] = long_jump[0]; @@ -934,13 +957,14 @@ void fix_branches(asm_context& ctx, std::vector& out) out[branch.first + branch.second->pass_flags - 1] = offset * 4; } else { out[branch.first] &= 0xffff0000u; - out[branch.first] |= (uint16_t) offset; + out[branch.first] |= (uint16_t)offset; } } } while (repeat); } -void fix_constaddrs(asm_context& ctx, std::vector& out) +void +fix_constaddrs(asm_context& ctx, std::vector& out) { for (auto& constaddr : ctx.constaddrs) { constaddr_info& info = constaddr.second; @@ -948,13 +972,12 @@ void fix_constaddrs(asm_context& ctx, std::vector& out) } } -unsigned emit_program(Program* program, - std::vector& code) +unsigned +emit_program(Program* program, std::vector& code) { asm_context ctx(program); - if (program->stage.hw == HWStage::VS || - program->stage.hw == HWStage::FS || + if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS || program->stage.hw == HWStage::NGG) fix_exports(ctx, code, program); @@ -986,4 +1009,4 @@ unsigned emit_program(Program* program, return exec_size; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp index 5b32b4f468a..3d565f0c141 100644 --- a/src/amd/compiler/aco_dead_code_analysis.cpp +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -40,7 +40,8 @@ struct dce_ctx { std::vector uses; std::vector> live; - dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) + dce_ctx(Program* program) + : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) { live.reserve(program->blocks.size()); for 
(Block& block : program->blocks) @@ -48,7 +49,8 @@ struct dce_ctx { } }; -void process_block(dce_ctx& ctx, Block& block) +void +process_block(dce_ctx& ctx, Block& block) { std::vector& live = ctx.live[block.index]; assert(live.size() == block.instructions.size()); @@ -72,23 +74,26 @@ void process_block(dce_ctx& ctx, Block& block) if (process_predecessors) { for (unsigned pred_idx : block.linear_preds) - ctx.current_block = std::max(ctx.current_block, (int) pred_idx); + ctx.current_block = std::max(ctx.current_block, (int)pred_idx); } } } /* end namespace */ -bool is_dead(const std::vector& uses, Instruction *instr) +bool +is_dead(const std::vector& uses, Instruction* instr) { if (instr->definitions.empty() || instr->isBranch()) return false; if (std::any_of(instr->definitions.begin(), instr->definitions.end(), - [&uses] (const Definition& def) { return !def.isTemp() || uses[def.tempId()];})) + [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; })) return false; return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel)); } -std::vector dead_code_analysis(Program *program) { +std::vector +dead_code_analysis(Program* program) +{ dce_ctx ctx(program); @@ -105,5 +110,4 @@ std::vector dead_code_analysis(Program *program) { return ctx.uses; } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_dominance.cpp b/src/amd/compiler/aco_dominance.cpp index 45013b59688..c3dda2be957 100644 --- a/src/amd/compiler/aco_dominance.cpp +++ b/src/amd/compiler/aco_dominance.cpp @@ -38,7 +38,8 @@ namespace aco { -void dominator_tree(Program* program) +void +dominator_tree(Program* program) { program->blocks[0].logical_idom = 0; program->blocks[0].linear_idom = 0; @@ -48,7 +49,7 @@ void dominator_tree(Program* program) int new_logical_idom = -1; int new_linear_idom = -1; for (unsigned pred_idx : block.logical_preds) { - if ((int) program->blocks[pred_idx].logical_idom == -1) + if ((int)program->blocks[pred_idx].logical_idom == -1) continue; if (new_logical_idom == -1) { @@ -56,16 +57,16 @@ void dominator_tree(Program* program) continue; } - while ((int) pred_idx != new_logical_idom) { - if ((int) pred_idx > new_logical_idom) + while ((int)pred_idx != new_logical_idom) { + if ((int)pred_idx > new_logical_idom) pred_idx = program->blocks[pred_idx].logical_idom; - if ((int) pred_idx < new_logical_idom) + if ((int)pred_idx < new_logical_idom) new_logical_idom = program->blocks[new_logical_idom].logical_idom; } } for (unsigned pred_idx : block.linear_preds) { - if ((int) program->blocks[pred_idx].linear_idom == -1) + if ((int)program->blocks[pred_idx].linear_idom == -1) continue; if (new_linear_idom == -1) { @@ -73,10 +74,10 @@ void dominator_tree(Program* program) continue; } - while ((int) pred_idx != new_linear_idom) { - if ((int) pred_idx > new_linear_idom) + while ((int)pred_idx != new_linear_idom) { + if ((int)pred_idx > new_linear_idom) pred_idx = program->blocks[pred_idx].linear_idom; - if ((int) pred_idx < new_linear_idom) + if ((int)pred_idx < new_linear_idom) new_linear_idom = program->blocks[new_linear_idom].linear_idom; } } @@ -86,5 +87,5 @@ void dominator_tree(Program* program) } } -} +} // namespace aco #endif diff --git a/src/amd/compiler/aco_form_hard_clauses.cpp b/src/amd/compiler/aco_form_hard_clauses.cpp index 8fbedc32fe5..fe806f55c79 100644 --- a/src/amd/compiler/aco_form_hard_clauses.cpp +++ b/src/amd/compiler/aco_form_hard_clauses.cpp @@ -31,15 +31,15 @@ namespace aco { namespace { /* there can also be LDS and VALU clauses, but I don't see how 
those are interesting */ -enum clause_type -{ +enum clause_type { clause_vmem, clause_flat, clause_smem, clause_other, }; -void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr *instrs) +void +emit_clause(Builder& bld, unsigned num_instrs, aco_ptr* instrs) { unsigned start = 0; @@ -61,7 +61,8 @@ void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr *instrs } /* end namespace */ -void form_hard_clauses(Program *program) +void +form_hard_clauses(Program* program) { for (Block& block : program->blocks) { unsigned num_instrs = 0; @@ -77,7 +78,8 @@ void form_hard_clauses(Program *program) clause_type type = clause_other; if (instr->isVMEM() && !instr->operands.empty()) { - if (program->chip_class == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) + if (program->chip_class == GFX10 && instr->isMIMG() && + get_mimg_nsa_dwords(instr.get()) > 0) type = clause_other; else type = clause_vmem; @@ -109,4 +111,4 @@ void form_hard_clauses(Program *program) block.instructions = std::move(new_instructions); } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index ddd4037f6b3..3ef70854c0a 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -34,12 +34,15 @@ namespace aco { namespace { struct NOP_ctx_gfx6 { - void join(const NOP_ctx_gfx6 &other) { - set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); + void join(const NOP_ctx_gfx6& other) + { + set_vskip_mode_then_vector = + MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector); valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz); valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz); valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas); - salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); + salu_wr_m0_then_gds_msg_ttrace = + MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace); valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp); salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds); salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel); @@ -53,23 +56,21 @@ struct NOP_ctx_gfx6 { } } - bool operator==(const NOP_ctx_gfx6 &other) + bool operator==(const NOP_ctx_gfx6& other) { - return - set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && - valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && - valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && - valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && - vmem_store_then_wr_data == other.vmem_store_then_wr_data && - salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && - valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && - salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && - salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && - setreg_then_getsetreg == other.setreg_then_getsetreg && - smem_clause == other.smem_clause && - smem_write == other.smem_write && - BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && - BITSET_EQUAL(smem_clause_write, other.smem_clause_write); + return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector && + valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz && + valu_wr_exec_then_execz == other.valu_wr_exec_then_execz && + 
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas && + vmem_store_then_wr_data == other.vmem_store_then_wr_data && + salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace && + valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp && + salu_wr_m0_then_lds == other.salu_wr_m0_then_lds && + salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel && + setreg_then_getsetreg == other.setreg_then_getsetreg && + smem_clause == other.smem_clause && smem_write == other.smem_write && + BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) && + BITSET_EQUAL(smem_clause_write, other.smem_clause_write); } void add_wait_states(unsigned amount) @@ -154,7 +155,8 @@ struct NOP_ctx_gfx10 { std::bitset<128> sgprs_read_by_VMEM; std::bitset<128> sgprs_read_by_SMEM; - void join(const NOP_ctx_gfx10 &other) { + void join(const NOP_ctx_gfx10& other) + { has_VOPC |= other.has_VOPC; has_nonVALU_exec_read |= other.has_nonVALU_exec_read; has_VMEM |= other.has_VMEM; @@ -167,23 +169,19 @@ struct NOP_ctx_gfx10 { sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM; } - bool operator==(const NOP_ctx_gfx10 &other) + bool operator==(const NOP_ctx_gfx10& other) { - return - has_VOPC == other.has_VOPC && - has_nonVALU_exec_read == other.has_nonVALU_exec_read && - has_VMEM == other.has_VMEM && - has_branch_after_VMEM == other.has_branch_after_VMEM && - has_DS == other.has_DS && - has_branch_after_DS == other.has_branch_after_DS && - has_NSA_MIMG == other.has_NSA_MIMG && - has_writelane == other.has_writelane && - sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && - sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; + return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read && + has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM && + has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS && + has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane && + sgprs_read_by_VMEM == other.sgprs_read_by_VMEM && + sgprs_read_by_SMEM == other.sgprs_read_by_SMEM; } }; -int get_wait_states(aco_ptr& instr) +int +get_wait_states(aco_ptr& instr) { if (instr->opcode == aco_opcode::s_nop) return instr->sopp().imm + 1; @@ -193,16 +191,16 @@ int get_wait_states(aco_ptr& instr) return 1; } -bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) +bool +regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) { - return a_reg > b_reg ? - (a_reg - b_reg < b_size) : - (b_reg - a_reg < a_size); + return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size); } template -int handle_raw_hazard_internal(Program *program, Block *block, - int nops_needed, PhysReg reg, uint32_t mask) +int +handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg, + uint32_t mask) { unsigned mask_size = util_last_bit(mask); for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { @@ -217,10 +215,8 @@ int handle_raw_hazard_internal(Program *program, Block *block, } } - bool is_hazard = writemask != 0 && - ((pred->isVALU() && Valu) || - (pred->isVINTRP() && Vintrp) || - (pred->isSALU() && Salu)); + bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || + (pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu)); if (is_hazard) return nops_needed; @@ -238,17 +234,19 @@ int handle_raw_hazard_internal(Program *program, Block *block, * huge value. 
*/ for (unsigned lin_pred : block->linear_preds) { res = std::max(res, handle_raw_hazard_internal( - program, &program->blocks[lin_pred], nops_needed, reg, mask)); + program, &program->blocks[lin_pred], nops_needed, reg, mask)); } return res; } template -void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op) +void +handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op) { if (*NOPs >= min_states) return; - int res = handle_raw_hazard_internal(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); + int res = handle_raw_hazard_internal( + program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); *NOPs = MAX2(*NOPs, res); } @@ -256,7 +254,9 @@ static auto handle_valu_then_read_hazard = handle_raw_hazard; static auto handle_vintrp_then_read_hazard = handle_raw_hazard; static auto handle_valu_salu_then_read_hazard = handle_raw_hazard; -void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { +void +set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) +{ unsigned end = start + size - 1; unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { @@ -268,7 +268,9 @@ void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { } } -bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { +bool +test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) +{ unsigned end = start + size - 1; unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { @@ -291,18 +293,21 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) { * * SMEM clauses are only present on GFX8+, and only matter when XNACK is set. */ -void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx, - aco_ptr& instr, int *NOPs) +void +handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr& instr, + int* NOPs) { /* break off from previous SMEM clause if needed */ if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) { /* Don't allow clauses with store instructions since the clause's * instructions may use the same address. 
*/ - if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) { + if (ctx.smem_write || instr->definitions.empty() || + instr_info.is_atomic[(unsigned)instr->opcode]) { *NOPs = 1; } else if (program->dev.xnack_enabled) { for (Operand op : instr->operands) { - if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { + if (!op.isConstant() && + test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) { *NOPs = 1; break; } @@ -316,8 +321,10 @@ void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx, } /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */ -void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx, - aco_ptr& instr, std::vector>& new_instructions) +void +handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, + aco_ptr& instr, + std::vector>& new_instructions) { /* check hazards */ int NOPs = 0; @@ -343,14 +350,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c handle_smem_clause_hazards(program, ctx, instr, &NOPs); } else if (instr->isSALU()) { - if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 || + if (instr->opcode == aco_opcode::s_setreg_b32 || + instr->opcode == aco_opcode::s_setreg_imm32_b32 || instr->opcode == aco_opcode::s_getreg_b32) { NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); } if (program->chip_class == GFX9) { - if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 || - instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) { + if (instr->opcode == aco_opcode::s_movrels_b32 || + instr->opcode == aco_opcode::s_movrels_b64 || + instr->opcode == aco_opcode::s_movreld_b32 || + instr->opcode == aco_opcode::s_movreld_b64) { NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel); } } @@ -398,7 +408,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]); } - if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64) + if (instr->opcode == aco_opcode::v_div_fmas_f32 || + instr->opcode == aco_opcode::v_div_fmas_f64) NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas); } else if (instr->isVMEM() || instr->isFlatLike()) { /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. 
*/ @@ -412,13 +423,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); if (program->chip_class == GFX9) { - bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && - instr->flatlike().lds; - if (instr->isVINTRP() || + bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; + if (instr->isVINTRP() || lds_scratch_global || instr->opcode == aco_opcode::ds_read_addtid_b32 || instr->opcode == aco_opcode::ds_write_addtid_b32 || - instr->opcode == aco_opcode::buffer_store_lds_dword || - lds_scratch_global) { + instr->opcode == aco_opcode::buffer_store_lds_dword) { NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds); } } @@ -428,7 +437,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles if (NOPs) { /* create NOP */ - aco_ptr nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)}; + aco_ptr nop{ + create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)}; nop->imm = NOPs - 1; nop->block = -1; new_instructions.emplace_back(std::move(nop)); @@ -485,7 +495,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c ctx.salu_wr_m0_then_lds = 1; ctx.salu_wr_m0_then_moverel = 1; } - } else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) { + } else if (instr->opcode == aco_opcode::s_setreg_b32 || + instr->opcode == aco_opcode::s_setreg_imm32_b32) { SOPK_instruction& sopk = instr->sopk(); unsigned offset = (sopk.imm >> 6) & 0x1f; unsigned size = ((sopk.imm >> 11) & 0x1f) + 1; @@ -497,19 +508,16 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } } else if (instr->isVMEM() || instr->isFlatLike()) { /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */ - bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && - instr->operands.size() == 4 && - instr->operands[3].size() > 2 && - instr->operands[2].physReg() >= 128; - /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */ + bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 && + instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128; + /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit + * store) */ bool consider_mimg = instr->isMIMG() && instr->operands[1].regClass().type() == RegType::vgpr && - instr->operands[1].size() > 2 && - instr->operands[0].size() == 4; + instr->operands[1].size() > 2 && instr->operands[0].size() == 4; /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ - bool consider_flat = instr->isFlatLike() && - instr->operands.size() == 3 && - instr->operands[2].size() > 2; + bool consider_flat = + instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2; if (consider_buf || consider_mimg || consider_flat) { PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg(); unsigned size = instr->operands[consider_flat ? 
2 : 3].size(); @@ -520,22 +528,26 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c } template -bool check_written_regs(const aco_ptr &instr, const std::bitset &check_regs) +bool +check_written_regs(const aco_ptr& instr, const std::bitset& check_regs) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool { - bool writes_any = false; - for (unsigned i = 0; i < def.size(); i++) { - unsigned def_reg = def.physReg() + i; - writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; - } - return writes_any; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [&check_regs](const Definition& def) -> bool + { + bool writes_any = false; + for (unsigned i = 0; i < def.size(); i++) { + unsigned def_reg = def.physReg() + i; + writes_any |= def_reg < check_regs.size() && check_regs[def_reg]; + } + return writes_any; + }); } template -void mark_read_regs(const aco_ptr &instr, std::bitset ®_reads) +void +mark_read_regs(const aco_ptr& instr, std::bitset& reg_reads) { - for (const Operand &op : instr->operands) { + for (const Operand& op : instr->operands) { for (unsigned i = 0; i < op.size(); i++) { unsigned reg = op.physReg() + i; if (reg < reg_reads.size()) @@ -544,7 +556,8 @@ void mark_read_regs(const aco_ptr &instr, std::bitset ®_reads } } -bool VALU_writes_sgpr(aco_ptr& instr) +bool +VALU_writes_sgpr(aco_ptr& instr) { if (instr->isVOPC()) return true; @@ -557,24 +570,26 @@ bool VALU_writes_sgpr(aco_ptr& instr) return false; } -bool instr_writes_exec(const aco_ptr& instr) +bool +instr_writes_exec(const aco_ptr& instr) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { - return def.physReg() == exec_lo || def.physReg() == exec_hi; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) -> bool + { return def.physReg() == exec_lo || def.physReg() == exec_hi; }); } -bool instr_writes_sgpr(const aco_ptr& instr) +bool +instr_writes_sgpr(const aco_ptr& instr) { - return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool { - return def.getTemp().type() == RegType::sgpr; - }); + return std::any_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) -> bool + { return def.getTemp().type() == RegType::sgpr; }); } -inline bool instr_is_branch(const aco_ptr& instr) +inline bool +instr_is_branch(const aco_ptr& instr) { - return instr->opcode == aco_opcode::s_branch || - instr->opcode == aco_opcode::s_cbranch_scc0 || + return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 || instr->opcode == aco_opcode::s_cbranch_scc1 || instr->opcode == aco_opcode::s_cbranch_vccz || instr->opcode == aco_opcode::s_cbranch_vccnz || @@ -586,19 +601,20 @@ inline bool instr_is_branch(const aco_ptr& instr) instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user || instr->opcode == aco_opcode::s_subvector_loop_begin || instr->opcode == aco_opcode::s_subvector_loop_end || - instr->opcode == aco_opcode::s_setpc_b64 || - instr->opcode == aco_opcode::s_swappc_b64 || - instr->opcode == aco_opcode::s_getpc_b64 || - instr->opcode == aco_opcode::s_call_b64; + instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 || + instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64; } -void handle_instruction_gfx10(Program *program, 
Block *cur_block, NOP_ctx_gfx10 &ctx, - aco_ptr& instr, std::vector>& new_instructions) +void +handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, + aco_ptr& instr, + std::vector>& new_instructions) { - //TODO: s_dcache_inv needs to be in it's own group on GFX10 + // TODO: s_dcache_inv needs to be in it's own group on GFX10 /* VMEMtoScalarWriteHazard - * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between. + * Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" + * in-between. */ if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) { /* Remember all SGPRs that are read by the VMEM instruction */ @@ -624,7 +640,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.sgprs_read_by_VMEM.reset(); /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ - aco_ptr depctr{create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; + aco_ptr depctr{ + create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; depctr->imm = 0xffe3; depctr->block = -1; new_instructions.emplace_back(std::move(depctr)); @@ -639,13 +656,13 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 */ if (instr->isVOPC()) { ctx.has_VOPC = true; - } else if (ctx.has_VOPC && - (instr->opcode == aco_opcode::v_permlane16_b32 || - instr->opcode == aco_opcode::v_permlanex16_b32)) { + } else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 || + instr->opcode == aco_opcode::v_permlanex16_b32)) { ctx.has_VOPC = false; /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */ - aco_ptr v_mov{create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; + aco_ptr v_mov{ + create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)}; v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1); v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1); new_instructions.emplace_back(std::move(v_mov)); @@ -663,7 +680,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.has_nonVALU_exec_read = false; /* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */ - aco_ptr depctr{create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; + aco_ptr depctr{ + create_instruction(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)}; depctr->imm = 0xfffe; depctr->block = -1; new_instructions.emplace_back(std::move(depctr)); @@ -689,7 +707,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.sgprs_read_by_SMEM.reset(); /* Insert s_mov to mitigate the problem */ - aco_ptr s_mov{create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; + aco_ptr s_mov{ + create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)}; s_mov->definitions[0] = Definition(sgpr_null, s1); s_mov->operands[0] = Operand(0u); new_instructions.emplace_back(std::move(s_mov)); @@ -738,14 +757,16 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false; /* Insert s_waitcnt_vscnt to mitigate the problem */ - aco_ptr wait{create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; + aco_ptr wait{ + create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)}; wait->definitions[0] = Definition(sgpr_null, s1); wait->imm = 0; 
new_instructions.emplace_back(std::move(wait)); } /* NSAToVMEMBug - * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0). + * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != + * 0). */ if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) { ctx.has_NSA_MIMG = true; @@ -772,11 +793,12 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 } template -using HandleInstr = void (*)(Program *, Block *block, Ctx&, aco_ptr&, +using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr&, std::vector>&); template Handle> -void handle_block(Program *program, Ctx& ctx, Block& block) +void +handle_block(Program* program, Ctx& ctx, Block& block) { if (block.instructions.empty()) return; @@ -793,14 +815,15 @@ void handle_block(Program *program, Ctx& ctx, Block& block) } template Handle> -void mitigate_hazards(Program *program) +void +mitigate_hazards(Program* program) { std::vector all_ctx(program->blocks.size()); std::stack loop_header_indices; for (unsigned i = 0; i < program->blocks.size(); i++) { Block& block = program->blocks[i]; - Ctx &ctx = all_ctx[i]; + Ctx& ctx = all_ctx[i]; if (block.kind & block_kind_loop_header) { loop_header_indices.push(i); @@ -832,7 +855,8 @@ void mitigate_hazards(Program *program) } /* end namespace */ -void insert_NOPs(Program* program) +void +insert_NOPs(Program* program) { if (program->chip_class >= GFX10_3) ; /* no hazards/bugs to mitigate */ @@ -842,4 +866,4 @@ void insert_NOPs(Program* program) mitigate_hazards(program); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 29a74e15843..288ade88764 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -24,6 +24,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "util/u_math.h" #include @@ -55,10 +56,9 @@ struct wqm_ctx { std::vector defined_in; std::vector needs_wqm; std::vector branch_wqm; /* true if the branch condition in this block should be in wqm */ - wqm_ctx(Program* program_) : program(program_), - defined_in(program->peekAllocationId(), 0xFFFF), - needs_wqm(program->peekAllocationId()), - branch_wqm(program->blocks.size()) + wqm_ctx(Program* program_) + : program(program_), defined_in(program->peekAllocationId(), 0xFFFF), + needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size()) { for (unsigned i = 0; i < program->blocks.size(); i++) worklist.insert(i); @@ -72,13 +72,15 @@ struct loop_info { bool has_divergent_break; bool has_divergent_continue; bool has_discard; /* has a discard or demote */ - loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) : - loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks), - has_divergent_continue(cont), has_discard(discard) {} + loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) + : loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks), + has_divergent_continue(cont), has_discard(discard) + {} }; struct block_info { - std::vector> exec; /* Vector of exec masks. Either a temporary or const -1. */ + std::vector> + exec; /* Vector of exec masks. Either a temporary or const -1. 
*/ std::vector instr_needs; uint8_t block_needs; uint8_t ever_again_needs; @@ -87,14 +89,16 @@ struct block_info { }; struct exec_ctx { - Program *program; + Program* program; std::vector info; std::vector loop; bool handle_wqm = false; - exec_ctx(Program *program_) : program(program_), info(program->blocks.size()) {} + exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {} }; -bool needs_exact(aco_ptr& instr) { +bool +needs_exact(aco_ptr& instr) +{ if (instr->isMUBUF()) { return instr->mubuf().disable_wqm; } else if (instr->isMTBUF()) { @@ -108,7 +112,8 @@ bool needs_exact(aco_ptr& instr) { } } -void set_needs_wqm(wqm_ctx &ctx, Temp tmp) +void +set_needs_wqm(wqm_ctx& ctx, Temp tmp) { if (!ctx.needs_wqm[tmp.id()]) { ctx.needs_wqm[tmp.id()] = true; @@ -117,7 +122,8 @@ void set_needs_wqm(wqm_ctx &ctx, Temp tmp) } } -void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) +void +mark_block_wqm(wqm_ctx& ctx, unsigned block_idx) { if (ctx.branch_wqm[block_idx]) return; @@ -136,7 +142,8 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) mark_block_wqm(ctx, pred_idx); } -void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) +void +get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block) { block_info& info = exec_ctx.info[block->index]; @@ -146,8 +153,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) aco_ptr& instr = block->instructions[i]; WQMState needs = needs_exact(instr) ? Exact : Unspecified; - bool propagate_wqm = instr->opcode == aco_opcode::p_wqm || - instr->opcode == aco_opcode::p_as_uniform; + bool propagate_wqm = + instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform; bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if; bool pred_by_exec = needs_exec_mask(instr.get()); for (const Definition& definition : instr->definitions) { @@ -214,7 +221,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) * breaks, which might benefit from being in exact) by adding Exact_Branch to a * divergent branch surrounding the nested loop, if such a branch exists. */ -void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) +void +handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) { for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) { Block& block = exec_ctx.program->blocks[idx]; @@ -231,7 +239,8 @@ void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) * ensure that the exact exec mask is not empty by adding Exact_Branch to * the outer divergent branch. 
*/ -void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) +void +handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) { assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header); @@ -265,7 +274,8 @@ void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader) } } -void calculate_wqm_needs(exec_ctx& exec_ctx) +void +calculate_wqm_needs(exec_ctx& exec_ctx) { wqm_ctx ctx(exec_ctx.program); @@ -307,14 +317,12 @@ void calculate_wqm_needs(exec_ctx& exec_ctx) exec_ctx.info[i].block_needs |= Exact; /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */ - if ((block.kind & block_kind_discard || - block.kind & block_kind_uses_discard_if) && + if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) && ever_again_needs & WQM) exec_ctx.info[i].block_needs |= Preserve_WQM; ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch; - if (block.kind & block_kind_discard || - block.kind & block_kind_uses_discard_if || + if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if || block.kind & block_kind_uses_demote) ever_again_needs |= Exact; @@ -327,7 +335,8 @@ void calculate_wqm_needs(exec_ctx& exec_ctx) exec_ctx.handle_wqm = true; } -Operand get_exec_op(Operand t) +Operand +get_exec_op(Operand t) { if (t.isUndefined()) return Operand(exec, t.regClass()); @@ -335,7 +344,8 @@ Operand get_exec_op(Operand t) return t; } -void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) +void +transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) { if (ctx.info[idx].exec.back().second & mask_type_wqm) return; @@ -346,7 +356,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) ctx.info[idx].exec.back().first = exec_mask; } - exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), get_exec_op(exec_mask)); + exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), + get_exec_op(exec_mask)); ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm); return; } @@ -355,11 +366,12 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_wqm); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); } -void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) +void +transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) { if (ctx.info[idx].exec.back().second & mask_type_exact) return; @@ -372,8 +384,8 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); return; } /* otherwise, we create an exact mask and push to the stack */ @@ -382,14 +394,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned 
idx) wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm)); } else { - bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].first, wqm); + bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), + ctx.info[idx].exec[0].first, wqm); } ctx.info[idx].exec.back().first = Operand(wqm); ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact); } -unsigned add_coupling_code(exec_ctx& ctx, Block* block, - std::vector>& instructions) +unsigned +add_coupling_code(exec_ctx& ctx, Block* block, std::vector>& instructions) { unsigned idx = block->index; Builder bld(ctx.program, &instructions); @@ -417,7 +430,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { uint8_t mask = mask_type_global; if (ctx.program->needs_wqm) { - bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm)); + bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), + Operand(exec, bld.lm)); mask |= mask_type_wqm; } else { mask |= mask_type_exact; @@ -440,7 +454,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (info.has_discard) { aco_ptr phi; for (int i = 0; i < info.num_exec_masks - 1; i++) { - phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)); + phi.reset(create_instruction(aco_opcode::p_linear_phi, + Format::PSEUDO, preds.size(), 1)); phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first); ctx.info[idx].exec[i].first = bld.insert(std::move(phi)); @@ -450,14 +465,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, /* create ssa name for restore mask */ if (info.has_divergent_break) { /* this phi might be trivial but ensures a parallelcopy on the loop header */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first); ctx.info[idx].exec.back().first = bld.insert(std::move(phi)); } /* create ssa name for loop active mask */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; if (info.has_divergent_continue) phi->definitions[0] = bld.def(bld.lm); else @@ -466,7 +483,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, Temp loop_active = bld.insert(std::move(phi)); if (info.has_divergent_break) { - uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; + uint8_t mask_type = + (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; ctx.info[idx].exec.emplace_back(loop_active, mask_type); } else { ctx.info[idx].exec.back().first = Operand(loop_active); @@ -482,8 +500,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); - ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first), mask_type); + ctx.info[idx].exec.emplace_back( + bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, 
bld.lm), + ctx.info[idx].exec.back().first), + mask_type); } return i; @@ -514,14 +534,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, aco_ptr& phi = header->instructions[instr_idx++]; assert(phi->opcode == aco_opcode::p_linear_phi); for (unsigned i = 1; i < phi->operands.size(); i++) - phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); + phi->operands[i] = + get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); } if (info.has_divergent_break) { aco_ptr& phi = header->instructions[instr_idx]; assert(phi->opcode == aco_opcode::p_linear_phi); for (unsigned i = 1; i < phi->operands.size(); i++) - phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); + phi->operands[i] = + get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); } assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2); @@ -541,7 +563,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, ctx.info[idx].exec.emplace_back(same, type); } else { /* create phi for loop footer */ - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; phi->definitions[0] = bld.def(bld.lm); if (exec_idx == info.num_exec_masks - 1u) { phi->definitions[0] = Definition(exec, bld.lm); @@ -578,8 +601,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) { /* move current exec mask into exec register */ - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); } ctx.loop.pop_back(); @@ -591,8 +614,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { assert(preds.size() == 2); /* if one of the predecessors ends in exact mask, we pop it from stack */ - unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(), - ctx.info[preds[1]].exec.size()); + unsigned num_exec_masks = + std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size()); if (block->kind & block_kind_merge) num_exec_masks--; @@ -605,14 +628,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) { Operand t = ctx.info[preds[0]].exec[i].first; /* discard/demote can change the state of the current exec mask */ - assert(!t.isTemp() || ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); + assert(!t.isTemp() || + ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; ctx.info[idx].exec.emplace_back(t, mask); continue; } bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge); - Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm), + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, + in_exec ? 
Definition(exec, bld.lm) : bld.def(bld.lm), get_exec_op(ctx.info[preds[0]].exec[i].first), get_exec_op(ctx.info[preds[1]].exec[i].first)); uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; @@ -654,9 +679,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, return i; } -void process_instructions(exec_ctx& ctx, Block* block, - std::vector>& instructions, - unsigned idx) +void +process_instructions(exec_ctx& ctx, Block* block, std::vector>& instructions, + unsigned idx) { WQMState state; if (ctx.info[block->index].exec.back().second & mask_type_wqm) @@ -667,17 +692,16 @@ void process_instructions(exec_ctx& ctx, Block* block, } /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */ - bool process = (ctx.handle_wqm && - (ctx.info[block->index].block_needs & state) != - (ctx.info[block->index].block_needs & (WQM | Exact))) || + bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) != + (ctx.info[block->index].block_needs & (WQM | Exact))) || block->kind & block_kind_uses_discard_if || - block->kind & block_kind_uses_demote || - block->kind & block_kind_needs_lowering; + block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering; if (!process) { std::vector>::iterator it = std::next(block->instructions.begin(), idx); instructions.insert(instructions.end(), std::move_iterator>::iterator>(it), - std::move_iterator>::iterator>(block->instructions.end())); + std::move_iterator>::iterator>( + block->instructions.end())); return; } @@ -700,11 +724,13 @@ void process_instructions(exec_ctx& ctx, Block* block, /* discard from current exec */ const Operand cond = instr->operands[0]; Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), - Operand(exec, bld.lm), cond).def(1).getTemp(); + Operand(exec, bld.lm), cond) + .def(1) + .getTemp(); /* discard from inner to outer exec mask on stack */ for (int i = num - 2; i >= 0; i--) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp()); exit_cond = andn2->definitions[1].getTemp(); @@ -726,14 +752,16 @@ void process_instructions(exec_ctx& ctx, Block* block, Definition dst = instr->definitions[0]; assert(dst.size() == bld.lm.size()); if (state == Exact) { - instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1)); + instr.reset(create_instruction(bld.w64or32(Builder::s_mov), + Format::SOP1, 1, 1)); instr->operands[0] = Operand(0u); instr->definitions[0] = dst; } else { std::pair& exact_mask = ctx.info[block->index].exec[0]; assert(exact_mask.second & mask_type_exact); - instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2)); + instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), + Format::SOP2, 2, 2)); instr->operands[0] = Operand(exec, bld.lm); /* current exec */ instr->operands[1] = Operand(exact_mask.first); instr->definitions[0] = dst; @@ -741,7 +769,8 @@ void process_instructions(exec_ctx& ctx, Block* block, } } else if (instr->opcode == aco_opcode::p_demote_to_helper) { /* turn demote into discard_if with only exact masks */ - assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global)); + assert((ctx.info[block->index].exec[0].second 
& (mask_type_exact | mask_type_global)) == + (mask_type_exact | mask_type_global)); int num; Temp cond, exit_cond; @@ -749,8 +778,9 @@ void process_instructions(exec_ctx& ctx, Block* block, assert(instr->operands[0].constantValue() == -1u); /* transition to exact and set exec to zero */ exit_cond = bld.tmp(s1); - cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)), - Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); + cond = + bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)), + Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); num = ctx.info[block->index].exec.size() - 2; if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) { @@ -767,7 +797,7 @@ void process_instructions(exec_ctx& ctx, Block* block, for (int i = num; i >= 0; i--) { if (ctx.info[block->index].exec[i].second & mask_type_exact) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == (int)ctx.info[block->index].exec.size() - 1) { andn2->operands[0] = Operand(exec, bld.lm); @@ -783,14 +813,14 @@ void process_instructions(exec_ctx& ctx, Block* block, instr->opcode = aco_opcode::p_exit_early_if; instr->operands[0] = bld.scc(exit_cond); state = Exact; - } bld.insert(std::move(instr)); } } -void add_branch_code(exec_ctx& ctx, Block* block) +void +add_branch_code(exec_ctx& ctx, Block* block) { unsigned idx = block->index; Builder bld(ctx.program, block); @@ -806,8 +836,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) } assert(ctx.info[idx].exec.size() <= 2); - if (ctx.info[idx].ever_again_needs == 0 || - ctx.info[idx].ever_again_needs == Exact) { + if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) { /* transition to Exact */ aco_ptr branch = std::move(block->instructions.back()); block->instructions.pop_back(); @@ -838,8 +867,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Block& loop_block = ctx.program->blocks[i]; needs |= ctx.info[i].block_needs; - if (loop_block.kind & block_kind_uses_discard_if || - loop_block.kind & block_kind_discard || + if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard || loop_block.kind & block_kind_uses_demote) has_discard = true; if (loop_block.loop_nest_depth != loop_nest_depth) @@ -871,12 +899,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) if (block->kind & block_kind_top_level) num_exec_masks = std::min(num_exec_masks, 2u); - ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], - num_exec_masks, - needs, - has_divergent_break, - has_divergent_continue, - has_discard); + ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs, + has_divergent_break, has_divergent_continue, has_discard); } /* For normal breaks, this is the exec mask. 
For discard+break, it's the @@ -903,7 +927,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm)); for (int i = num - 1; i >= 0; i--) { - Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), + Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), get_exec_op(ctx.info[block->index].exec[i].first), cond); if (i == (int)ctx.info[idx].exec.size() - 1) andn2->definitions[0] = Definition(exec, bld.lm); @@ -919,8 +943,10 @@ void add_branch_code(exec_ctx& ctx, Block* block) } if (block->kind & block_kind_continue_or_break) { - assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header); - assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & + block_kind_loop_header); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & + block_kind_loop_exit); assert(block->instructions.back()->opcode == aco_opcode::p_branch); block->instructions.pop_back(); @@ -931,8 +957,10 @@ void add_branch_code(exec_ctx& ctx, Block* block) } if (need_parallelcopy) - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + ctx.info[idx].exec.back().first = bld.pseudo( + aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -949,8 +977,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) if (block->kind & block_kind_branch) { - if (ctx.handle_wqm && - ctx.info[idx].exec.size() >= 2 && + if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 && ctx.info[idx].exec.back().second == mask_type_exact && !(ctx.info[idx].block_needs & Exact_Branch) && ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) { @@ -972,7 +999,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond); } else { Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), - Definition(exec, bld.lm), cond, Operand(exec, bld.lm)); + Definition(exec, bld.lm), cond, Operand(exec, bld.lm)); ctx.info[idx].exec.back().first = Operand(old_exec); } @@ -980,7 +1007,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) /* add next current exec to the stack */ ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type); - bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -990,9 +1018,11 @@ void add_branch_code(exec_ctx& ctx, Block* block) block->instructions.pop_back(); assert(ctx.info[idx].exec.size() >= 2); Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first; - bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm)); + bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), 
bld.def(s1, scc), orig_exec, + Operand(exec, bld.lm)); - bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -1020,7 +1050,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2)); } - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), + block->linear_succs[1], block->linear_succs[0]); return; } @@ -1048,12 +1079,14 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2)); } - bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), + block->linear_succs[1], block->linear_succs[0]); return; } } -void process_block(exec_ctx& ctx, Block* block) +void +process_block(exec_ctx& ctx, Block* block) { std::vector> instructions; instructions.reserve(block->instructions.size()); @@ -1072,8 +1105,8 @@ void process_block(exec_ctx& ctx, Block* block) } /* end namespace */ - -void insert_exec_mask(Program *program) +void +insert_exec_mask(Program* program) { exec_ctx ctx(program); @@ -1082,8 +1115,6 @@ void insert_exec_mask(Program *program) for (Block& block : program->blocks) process_block(ctx, &block); - -} - } +} // namespace aco diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 83c6dac0263..e4788270c98 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "common/sid.h" #include @@ -49,7 +50,8 @@ namespace { * - or erase gprs with counters higher than to be waited for. */ -// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load +// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) +// when there is a load followed by a use of a previous load /* Instructions of the same event will finish in-order except for smem * and maybe flat. Instructions of different events may not finish in-order. 
*/ @@ -77,54 +79,50 @@ enum counter_type : uint8_t { num_counters = 4, }; -static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; +static const uint16_t exp_events = + event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; static const uint16_t vm_events = event_vmem | event_flat; static const uint16_t vs_events = event_vmem_store; -uint8_t get_counters_for_event(wait_event ev) +uint8_t +get_counters_for_event(wait_event ev) { switch (ev) { case event_smem: case event_lds: case event_gds: - case event_sendmsg: - return counter_lgkm; - case event_vmem: - return counter_vm; - case event_vmem_store: - return counter_vs; - case event_flat: - return counter_vm | counter_lgkm; + case event_sendmsg: return counter_lgkm; + case event_vmem: return counter_vm; + case event_vmem_store: return counter_vs; + case event_flat: return counter_vm | counter_lgkm; case event_exp_pos: case event_exp_param: case event_exp_mrt_null: case event_gds_gpr_lock: - case event_vmem_gpr_lock: - return counter_exp; - default: - return 0; + case event_vmem_gpr_lock: return counter_exp; + default: return 0; } } struct wait_entry { wait_imm imm; - uint16_t events; /* use wait_event notion */ + uint16_t events; /* use wait_event notion */ uint8_t counters; /* use counter_type notion */ - bool wait_on_read:1; - bool logical:1; - bool has_vmem_nosampler:1; - bool has_vmem_sampler:1; + bool wait_on_read : 1; + bool logical : 1; + bool has_vmem_nosampler : 1; + bool has_vmem_sampler : 1; wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_) - : imm(imm_), events(event_), counters(get_counters_for_event(event_)), - wait_on_read(wait_on_read_), logical(logical_), - has_vmem_nosampler(false), has_vmem_sampler(false) {} + : imm(imm_), events(event_), counters(get_counters_for_event(event_)), + wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false), + has_vmem_sampler(false) + {} bool join(const wait_entry& other) { - bool changed = (other.events & ~events) || - (other.counters & ~counters) || + bool changed = (other.events & ~events) || (other.counters & ~counters) || (other.wait_on_read && !wait_on_read) || (other.has_vmem_nosampler && !has_vmem_nosampler) || (other.has_vmem_sampler && !has_vmem_sampler); @@ -156,7 +154,8 @@ struct wait_entry { if (counter == counter_exp) { imm.exp = wait_imm::unset_counter; - events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock); + events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | + event_vmem_gpr_lock); } if (counter == counter_vs) { @@ -170,7 +169,7 @@ struct wait_entry { }; struct wait_ctx { - Program *program; + Program* program; enum chip_class chip_class; uint16_t max_vm_cnt; uint16_t max_exp_cnt; @@ -189,24 +188,21 @@ struct wait_ctx { wait_imm barrier_imm[storage_count]; uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */ - std::map gpr_map; + std::map gpr_map; wait_ctx() {} - wait_ctx(Program *program_) - : program(program_), - chip_class(program_->chip_class), - max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), - max_exp_cnt(6), - max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), - max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), - unordered_events(event_smem | (program_->chip_class < GFX10 ? 
event_flat : 0)) {} + wait_ctx(Program* program_) + : program(program_), chip_class(program_->chip_class), + max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6), + max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), + max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), + unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) + {} bool join(const wait_ctx* other, bool logical) { - bool changed = other->exp_cnt > exp_cnt || - other->vm_cnt > vm_cnt || - other->lgkm_cnt > lgkm_cnt || - other->vs_cnt > vs_cnt || + bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt || + other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt || (other->pending_flat_lgkm && !pending_flat_lgkm) || (other->pending_flat_vm && !pending_flat_vm); @@ -218,12 +214,11 @@ struct wait_ctx { pending_flat_vm |= other->pending_flat_vm; pending_s_buffer_store |= other->pending_s_buffer_store; - for (const auto& entry : other->gpr_map) - { + for (const auto& entry : other->gpr_map) { if (entry.second.logical != logical) continue; - using iterator = std::map::iterator; + using iterator = std::map::iterator; const std::pair insert_pair = gpr_map.insert(entry); if (insert_pair.second) { changed = true; @@ -241,12 +236,14 @@ struct wait_ctx { return changed; } - void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) { + void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) + { entry.remove_counter(counter); } }; -wait_imm check_instr(Instruction* instr, wait_ctx& ctx) +wait_imm +check_instr(Instruction* instr, wait_ctx& ctx) { wait_imm wait; @@ -257,7 +254,7 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) /* check consecutively read gprs */ for (unsigned j = 0; j < op.size(); j++) { PhysReg reg{op.physReg() + j}; - std::map::iterator it = ctx.gpr_map.find(reg); + std::map::iterator it = ctx.gpr_map.find(reg); if (it == ctx.gpr_map.end() || !it->second.wait_on_read) continue; @@ -267,22 +264,24 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) for (const Definition& def : instr->definitions) { /* check consecutively written gprs */ - for (unsigned j = 0; j < def.getTemp().size(); j++) - { + for (unsigned j = 0; j < def.getTemp().size(); j++) { PhysReg reg{def.physReg() + j}; - std::map::iterator it = ctx.gpr_map.find(reg); + std::map::iterator it = ctx.gpr_map.find(reg); if (it == ctx.gpr_map.end()) continue; /* Vector Memory reads and writes return in the order they were issued */ - bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; + bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4; if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) && - it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler) + it->second.has_vmem_nosampler == !has_sampler && + it->second.has_vmem_sampler == has_sampler) continue; /* LDS reads and writes return in the order they were issued. same for GDS */ - if (instr->isDS() && (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds)) + if (instr->isDS() && + (it->second.events & lgkm_events) == (instr->ds().gds ? 
event_gds : event_lds)) continue; wait.combine(it->second.imm); @@ -292,7 +291,8 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) return wait; } -wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) +wait_imm +parse_wait_instr(wait_ctx& ctx, Instruction* instr) { if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->definitions[0].physReg() == sgpr_null) { @@ -305,10 +305,12 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) return wait_imm(); } -wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) +wait_imm +perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) { wait_imm imm; - sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; + sync_scope subgroup_scope = + ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; if ((sync.semantics & semantics) && sync.scope > subgroup_scope) { unsigned storage = sync.storage; while (storage) { @@ -321,7 +323,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic if (bar_scope_lds <= subgroup_scope) events &= ~event_lds; - /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same workgroup */ + /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations + * in-order for the same workgroup */ if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup) events &= ~(event_vmem | event_vmem_store | event_smem); @@ -333,7 +336,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic return imm; } -void force_waitcnt(wait_ctx& ctx, wait_imm& imm) +void +force_waitcnt(wait_ctx& ctx, wait_imm& imm) { if (ctx.vm_cnt) imm.vm = 0; @@ -348,7 +352,8 @@ void force_waitcnt(wait_ctx& ctx, wait_imm& imm) } } -wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) +wait_imm +kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { wait_imm imm; @@ -364,7 +369,6 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) imm.combine(parse_wait_instr(ctx, instr)); - /* It's required to wait for scalar stores before "writing back" data. * It shouldn't cost anything anyways since we're about to do s_endpgm. */ @@ -380,20 +384,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) * * TODO: Refine this when we have proper alias analysis. */ - if (ctx.pending_s_buffer_store && - !instr->smem().definitions.empty() && + if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() && !instr->smem().sync.can_reorder()) { imm.lgkm = 0; } } if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) { - if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && - instr->exp().dest < V_008DFC_SQ_EXP_PRIM) { + if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) { - /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos export. - * Wait for all stores (and atomics) to complete, so PS can read them. - * TODO: This only really applies to DONE pos exports. Consider setting the DONE bit earlier. + /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos + * export. Wait for all stores (and atomics) to complete, so PS can read them. + * TODO: This only really applies to DONE pos exports. + * Consider setting the DONE bit earlier. 
*/ if (ctx.vs_cnt > 0) imm.vs = 0; @@ -444,9 +447,8 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) } /* remove all gprs with higher counter from map */ - std::map::iterator it = ctx.gpr_map.begin(); - while (it != ctx.gpr_map.end()) - { + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) { if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp); if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) @@ -472,13 +474,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) return imm; } -void update_barrier_counter(uint8_t *ctr, unsigned max) +void +update_barrier_counter(uint8_t* ctr, unsigned max) { if (*ctr != wait_imm::unset_counter && *ctr < max) (*ctr)++; } -void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) +void +update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) { for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; @@ -506,7 +510,8 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memor } } -void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info()) +void +update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info()) { uint8_t counters = get_counters_for_event(event); @@ -529,7 +534,7 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo if (ctx.pending_flat_vm) counters &= ~counter_vm; - for (std::pair& e : ctx.gpr_map) { + for (std::pair& e : ctx.gpr_map) { wait_entry& entry = e.second; if (entry.events & ctx.unordered_events) @@ -537,18 +542,23 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo assert(entry.events); - if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt) + if ((counters & counter_exp) && (entry.events & exp_events) == event && + entry.imm.exp < ctx.max_exp_cnt) entry.imm.exp++; - if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt) + if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && + entry.imm.lgkm < ctx.max_lgkm_cnt) entry.imm.lgkm++; - if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt) + if ((counters & counter_vm) && (entry.events & vm_events) == event && + entry.imm.vm < ctx.max_vm_cnt) entry.imm.vm++; - if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt) + if ((counters & counter_vs) && (entry.events & vs_events) == event && + entry.imm.vs < ctx.max_vs_cnt) entry.imm.vs++; } } -void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info()) +void +update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info()) { assert(ctx.chip_class < GFX10); @@ -559,8 +569,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); - for (std::pair e : ctx.gpr_map) - { + for (std::pair e : ctx.gpr_map) { if (e.second.counters & counter_vm) e.second.imm.vm = 0; if (e.second.counters & counter_lgkm) @@ -570,8 +579,9 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s ctx.pending_flat_vm = true; } -void 
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, - bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, + bool has_sampler = false) { uint16_t counters = get_counters_for_event(event); wait_imm imm; @@ -589,24 +599,27 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler; for (unsigned i = 0; i < rc.size(); i++) { - auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry); + auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry); if (!it.second) it.first->second.join(new_entry); } } -void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false) { if (!op.isConstant() && !op.isUndefined()) insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler); } -void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false) +void +insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false) { insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler); } -void gen(Instruction* instr, wait_ctx& ctx) +void +gen(Instruction* instr, wait_ctx& ctx) { switch (instr->format) { case Format::EXP: { @@ -622,13 +635,11 @@ void gen(Instruction* instr, wait_ctx& ctx) update_counters(ctx, ev); /* insert new entries for exported vgprs */ - for (unsigned i = 0; i < 4; i++) - { + for (unsigned i = 0; i < 4; i++) { if (exp_instr.enabled_mask & (1 << i)) { unsigned idx = exp_instr.compressed ? i >> 1 : i; assert(idx < exp_instr.operands.size()); insert_wait_entry(ctx, exp_instr.operands[idx], ev); - } } insert_wait_entry(ctx, exec, s2, ev, false); @@ -651,8 +662,7 @@ void gen(Instruction* instr, wait_ctx& ctx) if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); - else if (ctx.chip_class >= GFX10 && - !smem.sync.can_reorder()) + else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder()) ctx.pending_s_buffer_store = true; break; @@ -677,23 +687,21 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::MTBUF: case Format::MIMG: case Format::GLOBAL: { - wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store; + wait_event ev = + !instr->definitions.empty() || ctx.chip_class < GFX10 ? 
event_vmem : event_vmem_store; update_counters(ctx, ev, get_sync_info(instr)); - bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; + bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4; if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler); - if (ctx.chip_class == GFX6 && - instr->format != Format::MIMG && - instr->operands.size() == 4) { + if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); - } else if (ctx.chip_class == GFX6 && - instr->isMIMG() && - !instr->operands[2].isUndefined()) { + } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock); @@ -702,35 +710,37 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::SOPP: { - if (instr->opcode == aco_opcode::s_sendmsg || - instr->opcode == aco_opcode::s_sendmsghalt) + if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt) update_counters(ctx, event_sendmsg); break; } - default: - break; + default: break; } } -void emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm imm) +void +emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm imm) { if (imm.vs != wait_imm::unset_counter) { assert(ctx.chip_class >= GFX10); - SOPK_instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1); + SOPK_instruction* waitcnt_vs = + create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1); waitcnt_vs->definitions[0] = Definition(sgpr_null, s1); waitcnt_vs->imm = imm.vs; instructions.emplace_back(waitcnt_vs); imm.vs = wait_imm::unset_counter; } if (!imm.empty()) { - SOPP_instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); + SOPP_instruction* waitcnt = + create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); waitcnt->imm = imm.pack(ctx.chip_class); waitcnt->block = -1; instructions.emplace_back(waitcnt); } } -void handle_block(Program *program, Block& block, wait_ctx& ctx) +void +handle_block(Program* program, Block& block, wait_ctx& ctx) { std::vector> new_instructions; @@ -763,7 +773,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } /* end namespace */ -void insert_wait_states(Program* program) +void +insert_wait_states(Program* program) { /* per BB ctx */ std::vector done(program->blocks.size()); @@ -818,5 +829,4 @@ void insert_wait_states(Program* program) } } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 2af31108aae..c7bdbb8b3c4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -47,14 +47,15 @@ namespace { #define isel_err(...) 
_isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) -static void _isel_err(isel_context *ctx, const char *file, unsigned line, - const nir_instr *instr, const char *msg) +static void +_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, + const char* msg) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); nir_print_instr(instr, memf); @@ -90,43 +91,48 @@ struct loop_context { bool divergent_if_old; }; -static bool visit_cf_list(struct isel_context *ctx, - struct exec_list *list); +static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list); -static void add_logical_edge(unsigned pred_idx, Block *succ) +static void +add_logical_edge(unsigned pred_idx, Block* succ) { succ->logical_preds.emplace_back(pred_idx); } - -static void add_linear_edge(unsigned pred_idx, Block *succ) +static void +add_linear_edge(unsigned pred_idx, Block* succ) { succ->linear_preds.emplace_back(pred_idx); } -static void add_edge(unsigned pred_idx, Block *succ) +static void +add_edge(unsigned pred_idx, Block* succ) { add_logical_edge(pred_idx, succ); add_linear_edge(pred_idx, succ); } -static void append_logical_start(Block *b) +static void +append_logical_start(Block* b) { Builder(NULL, b).pseudo(aco_opcode::p_logical_start); } -static void append_logical_end(Block *b) +static void +append_logical_end(Block* b) { Builder(NULL, b).pseudo(aco_opcode::p_logical_end); } -Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def) +Temp +get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def) { uint32_t id = ctx->first_temp_id + def->index; return Temp(id, ctx->program->temp_rc[id]); } -Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u)) +Temp +emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u)) { Builder bld(ctx->program, ctx->block); assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec)); @@ -142,7 +148,8 @@ Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand b if (mask.isTemp()) { RegClass rc = RegClass(mask.regClass().type(), 1); - Builder::Result mask_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); + Builder::Result mask_split = + bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); mask_lo = Operand(mask_split.def(0).getTemp()); mask_hi = Operand(mask_split.def(1).getTemp()); } else if (mask.physReg() == exec) { @@ -158,7 +165,8 @@ Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand b return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo); } -Temp emit_wqm(Builder& bld, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false) +Temp +emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false) { if (!dst.id()) dst = bld.tmp(src.regClass()); @@ -178,7 +186,8 @@ Temp emit_wqm(Builder& bld, Temp src, Temp dst=Temp(0, s1), bool program_needs_w return dst; } -static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data) +static Temp +emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) { if (index.regClass() == s1) return bld.readlane(bld.def(s1), data, index); @@ -190,14 +199,18 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data 
index_op.setLateKill(true); input_data.setLateKill(true); - return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), + index_op, input_data); } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { /* GFX10 wave64 mode: emulate full-wave bpermute */ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index); - Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); - Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp()); - Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1); + Builder::Result index_is_lo_split = + bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); + Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), + index_is_lo_split.def(1).getTemp()); + Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), + index_is_lo_split.def(0).getTemp(), index_is_lo_n1); Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); Operand input_data(data); @@ -209,7 +222,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data * Note, that these have twice the allocation granularity of normal VGPRs */ ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; - return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), + index_x4, input_data, same_half); } else { /* GFX8-9 or GFX10 wave32: bpermute works normally */ Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); @@ -217,7 +231,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data } } -static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask) +static Temp +emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) { if (ctx->options->chip_class >= GFX8) { unsigned and_mask = mask & 0x1f; @@ -247,7 +262,8 @@ static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsig return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); } -Temp as_vgpr(isel_context *ctx, Temp val) +Temp +as_vgpr(isel_context* ctx, Temp val) { if (val.type() == RegType::sgpr) { Builder bld(ctx->program, ctx->block); @@ -257,8 +273,9 @@ Temp as_vgpr(isel_context *ctx, Temp val) return val; } -//assumes a != 0xffffffff -void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) +// assumes a != 0xffffffff +void +emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b) { assert(b != 0); Builder bld(ctx->program, ctx->block); @@ -285,13 +302,14 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) Temp pre_shift_dst = a; if (pre_shift) { pre_shift_dst = (increment || multiply || post_shift) ? 
bld.tmp(v1) : dst; - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a); + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), + Operand((uint32_t)info.pre_shift), a); } Temp increment_dst = pre_shift_dst; if (increment) { increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; - bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst); + bld.vadd32(Definition(increment_dst), Operand((uint32_t)info.increment), pre_shift_dst); } Temp multiply_dst = increment_dst; @@ -302,18 +320,20 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) } if (post_shift) { - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst); + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), + multiply_dst); } } -void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) +void +emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) { Builder bld(ctx->program, ctx->block); bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx)); } - -Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) +Temp +emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) { /* no need to extract the whole vector */ if (src.regClass() == dst_rc) { @@ -347,7 +367,8 @@ Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst } } -void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) +void +emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) { if (num_components == 1) return; @@ -365,9 +386,10 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) } else { rc = RegClass(vec_src.type(), vec_src.size() / num_components); } - aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; split->operands[0] = Operand(vec_src); - std::array elems; + std::array elems; for (unsigned i = 0; i < num_components; i++) { elems[i] = ctx->program->allocateTmp(rc); split->definitions[i] = Definition(elems[i]); @@ -378,7 +400,8 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) /* This vector expansion uses a mask to determine which elements in the new vector * come from the original vector. The other elements are undefined. 
*/ -void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +void +expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) { emit_split_vector(ctx, vec_src, util_bitcount(mask)); @@ -395,14 +418,16 @@ void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_compo } unsigned component_size = dst.size() / num_components; - std::array elems; + std::array elems; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; vec->definitions[0] = Definition(dst); unsigned k = 0; for (unsigned i = 0; i < num_components; i++) { if (mask & (1 << i)) { - Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); + Temp src = + emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); if (dst.type() == RegType::sgpr) src = bld.as_uniform(src); vec->operands[i] = Operand(src); @@ -416,7 +441,8 @@ void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_compo } /* adjust misaligned small bit size loads */ -void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) +void +byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst) { Builder bld(ctx->program, ctx->block); Operand shift; @@ -426,9 +452,11 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) shift = Operand(offset.constantValue() * 8); } else { /* bit_offset = 8 * (offset & 0x3) */ - Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u)); + Temp tmp = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u)); select = bld.tmp(s1); - shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u)); + shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, + Operand(3u)); } if (vec.size() == 1) { @@ -463,7 +491,8 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) } } -void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) +void +byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) { Builder bld(ctx->program, ctx->block); if (offset.isTemp()) { @@ -471,10 +500,12 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un if (vec.size() == 4) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), + Definition(tmp[2]), Definition(tmp[3]), vec); } else if (vec.size() == 3) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), + Definition(tmp[2]), vec); } else if (vec.size() == 2) { tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); @@ -506,17 +537,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un for (unsigned i = skip; i < 
num_components; i++) elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); - /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { + /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ num_components = dst.bytes() / component_size; - aco_ptr create_vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + aco_ptr create_vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; for (unsigned i = 0; i < num_components; i++) create_vec->operands[i] = Operand(elems[i]); create_vec->definitions[0] = Definition(dst); bld.insert(std::move(create_vec)); - /* if dst is sgpr - split the src, but move the original to sgpr. */ } else if (skip) { + /* if dst is sgpr - split the src, but move the original to sgpr. */ vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); byte_align_scalar(ctx, vec, offset, dst); } else { @@ -527,7 +559,8 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un ctx->allocated_vec.emplace(dst.id(), elems); } -Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2)) +Temp +bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2)) { Builder bld(ctx->program, ctx->block); if (!dst.id()) @@ -536,10 +569,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2 assert(val.regClass() == s1); assert(dst.regClass() == bld.lm); - return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); + return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t)-1), Operand(0u), + bld.scc(val)); } -Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) +Temp +bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1)) { Builder bld(ctx->program, ctx->block); if (!dst.id()) @@ -563,9 +598,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1 * * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined. 
*/ -Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool sign_extend, Temp dst=Temp()) +Temp +convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, + bool sign_extend, Temp dst = Temp()) { - assert(!(sign_extend && dst_bits < src_bits) && "Shrinking integers is not supported for signed inputs"); + assert(!(sign_extend && dst_bits < src_bits) && + "Shrinking integers is not supported for signed inputs"); if (!dst.id()) { if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) @@ -592,14 +630,15 @@ Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, u if (tmp == src) { } else if (src.regClass() == s1) { assert(src_bits < 32); - bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), - src, Operand(0u), Operand(src_bits), Operand((unsigned)sign_extend)); + bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand(0u), + Operand(src_bits), Operand((unsigned)sign_extend)); } else if (ctx->options->chip_class >= GFX8) { assert(src_bits < 32); assert(src_bits != 8 || src.regClass() == v1b); assert(src_bits != 16 || src.regClass() == v2b); assert(dst_bits >= 16); - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + aco_ptr sdwa{ + create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(src); sdwa->definitions[0] = Definition(tmp); if (sign_extend) @@ -617,7 +656,8 @@ Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, u if (dst_bits == 64) { if (sign_extend && dst.regClass() == s2) { - Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); + Temp high = + bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); } else if (sign_extend && dst.regClass() == v2) { Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp); @@ -636,7 +676,8 @@ enum sgpr_extract_mode { sgpr_extract_undef, }; -Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode) +Temp +extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) { Temp vec = get_ssa_temp(ctx, src->src.ssa); unsigned src_size = src->src.ssa->bit_size; @@ -655,7 +696,8 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src bld.copy(Definition(tmp), vec); else bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), - Operand(swizzle), Operand(src_size), Operand((uint32_t)(mode == sgpr_extract_sext))); + Operand(swizzle), Operand(src_size), + Operand((uint32_t)(mode == sgpr_extract_sext))); if (dst.regClass() == s2) convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); @@ -663,7 +705,8 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src return dst; } -Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) +Temp +get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) { if (src.src.ssa->num_components == 1 && size == 1) return get_ssa_temp(ctx, src.src.ssa); @@ -685,17 +728,19 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) if (elem_size < 4 && vec.type() == RegType::sgpr) { assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); assert(size == 1); - return 
extract_8_16_bit_sgpr_element( - ctx, ctx->program->allocateTmp(s1), &src, sgpr_extract_undef); + return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, + sgpr_extract_undef); } - RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4); + RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() + : RegClass(vec.type(), elem_size / 4); if (size == 1) { return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); } else { assert(size <= 4); - std::array elems; - aco_ptr vec_instr{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + std::array elems; + aco_ptr vec_instr{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; for (unsigned i = 0; i < size; ++i) { elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); vec_instr->operands[i] = Operand{elems[i]}; @@ -708,7 +753,8 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) } } -Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src) +Temp +get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) { /* returns v2b or v1 for vop3p usage. * The source expects exactly 2 16bit components @@ -735,28 +781,32 @@ Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src) } } -uint32_t get_alu_src_ub(isel_context *ctx, nir_alu_instr *instr, int src_idx) +uint32_t +get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) { - nir_ssa_scalar scalar = nir_ssa_scalar{instr->src[src_idx].src.ssa, - instr->src[src_idx].swizzle[0]}; + nir_ssa_scalar scalar = + nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); } -Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr, bool non_uniform=false) +Temp +convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false) { if (ptr.size() == 2) return ptr; Builder bld(ctx->program, ctx->block); if (ptr.type() == RegType::vgpr && !non_uniform) ptr = bld.as_uniform(ptr); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), - ptr, Operand((unsigned)ctx->options->address32_hi)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr, + Operand((unsigned)ctx->options->address32_hi)); } -void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, - Temp dst, bool writes_scc, uint8_t uses_ub = 0) +void +emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool writes_scc, uint8_t uses_ub = 0) { - aco_ptr sop2{create_instruction(op, Format::SOP2, 2, writes_scc ? 2 : 1)}; + aco_ptr sop2{ + create_instruction(op, Format::SOP2, 2, writes_scc ? 
2 : 1)}; sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); @@ -778,10 +828,10 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o ctx->block->instructions.emplace_back(std::move(sop2)); } -void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, - bool commutative, bool swap_srcs=false, - bool flush_denorms = false, bool nuw = false, - uint8_t uses_ub = 0) +void +emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool commutative, bool swap_srcs = false, bool flush_denorms = false, + bool nuw = false, uint8_t uses_ub = 0) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -824,8 +874,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } } -void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, - aco_opcode op, Temp dst) +void +emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -849,11 +899,12 @@ void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); } -void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, - bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) +void +emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) { assert(num_sources == 2 || num_sources == 3); - Temp src[3] = { Temp(0, v1), Temp(0, v1), Temp(0, v1) }; + Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; bool has_sgpr = false; for (unsigned i = 0; i < num_sources; i++) { src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 
1 - i : i]); @@ -874,7 +925,8 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode if (dst.size() == 1) bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp); else - bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(UINT64_C(0x3FF0000000000000)), tmp); + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(UINT64_C(0x3FF0000000000000)), + tmp); } else if (num_sources == 3) { bld.vop3(op, Definition(dst), src[0], src[1], src[2]); } else { @@ -882,8 +934,9 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode } } -Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, - aco_opcode op, Temp dst, bool swap_srcs=false) +Builder::Result +emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, + bool swap_srcs = false) { Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); @@ -892,8 +945,10 @@ Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, assert(instr->dest.dest.ssa.num_components == 2); /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ - unsigned opsel_lo = (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); - unsigned opsel_hi = (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); + unsigned opsel_lo = + (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); + unsigned opsel_hi = + (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -902,7 +957,8 @@ Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, return res; } -void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -913,7 +969,8 @@ void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); } -void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); @@ -924,62 +981,25 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o if (src0.type() == RegType::vgpr) { /* to swap the operands, we might also have to change the opcode */ switch (op) { - case aco_opcode::v_cmp_lt_f16: - op = aco_opcode::v_cmp_gt_f16; - break; - case aco_opcode::v_cmp_ge_f16: - op = aco_opcode::v_cmp_le_f16; - break; - case aco_opcode::v_cmp_lt_i16: - op = aco_opcode::v_cmp_gt_i16; - break; - case aco_opcode::v_cmp_ge_i16: - op = aco_opcode::v_cmp_le_i16; - break; - case aco_opcode::v_cmp_lt_u16: - op = aco_opcode::v_cmp_gt_u16; - break; - case aco_opcode::v_cmp_ge_u16: - op = aco_opcode::v_cmp_le_u16; - break; - case aco_opcode::v_cmp_lt_f32: - op = aco_opcode::v_cmp_gt_f32; - break; - case aco_opcode::v_cmp_ge_f32: - op = aco_opcode::v_cmp_le_f32; - break; - case aco_opcode::v_cmp_lt_i32: - op = aco_opcode::v_cmp_gt_i32; - break; - case aco_opcode::v_cmp_ge_i32: - op = aco_opcode::v_cmp_le_i32; - break; - case aco_opcode::v_cmp_lt_u32: - op = 
aco_opcode::v_cmp_gt_u32; - break; - case aco_opcode::v_cmp_ge_u32: - op = aco_opcode::v_cmp_le_u32; - break; - case aco_opcode::v_cmp_lt_f64: - op = aco_opcode::v_cmp_gt_f64; - break; - case aco_opcode::v_cmp_ge_f64: - op = aco_opcode::v_cmp_le_f64; - break; - case aco_opcode::v_cmp_lt_i64: - op = aco_opcode::v_cmp_gt_i64; - break; - case aco_opcode::v_cmp_ge_i64: - op = aco_opcode::v_cmp_le_i64; - break; - case aco_opcode::v_cmp_lt_u64: - op = aco_opcode::v_cmp_gt_u64; - break; - case aco_opcode::v_cmp_ge_u64: - op = aco_opcode::v_cmp_le_u64; - break; - default: /* eq and ne are commutative */ - break; + case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break; + case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break; + case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break; + case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break; + case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break; + case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break; + case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break; + case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break; + case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break; + case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break; + case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break; + case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break; + case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break; + case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break; + case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break; + case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break; + case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break; + case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break; + default: /* eq and ne are commutative */ break; } Temp t = src0; src0 = src1; @@ -993,7 +1013,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1); } -void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +void +emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); @@ -1010,13 +1031,18 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bool_to_vector_condition(ctx, cmp, dst); } -void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, - aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes) +void +emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, + aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, + aco_opcode s64_op = aco_opcode::num_opcodes) { - aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes; - aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op; - bool use_valu = s_op == aco_opcode::num_opcodes || - nir_dest_is_divergent(instr->dest.dest) || + aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op + : instr->src[0].src.ssa->bit_size == 32 ? 
s32_op + : aco_opcode::num_opcodes; + aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op + : instr->src[0].src.ssa->bit_size == 32 ? v32_op + : v16_op; + bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) || get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; aco_opcode op = use_valu ? v_op : s_op; @@ -1029,7 +1055,9 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, emit_sopc_instruction(ctx, instr, op, dst); } -void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) +void +emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, + Temp dst) { Builder bld(ctx->program, ctx->block); Temp src0 = get_alu_src(ctx, instr->src[0]); @@ -1042,7 +1070,8 @@ void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSp bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); } -void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) +void +emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) { Builder bld(ctx->program, ctx->block); Temp cond = get_alu_src(ctx, instr->src[0]); @@ -1082,9 +1111,11 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { - assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + assert((then.regClass() == s1 || then.regClass() == s2) && + els.regClass() == then.regClass()); assert(dst.size() == then.size()); - aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; + aco_opcode op = + dst.regClass() == s1 ? 
aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); } else { isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); @@ -1107,12 +1138,14 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); } -void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, - aco_opcode op, uint32_t undo) +void +emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, + uint32_t undo) { /* multiply by 16777216 to handle denormals */ - Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), - as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); + Temp is_denormal = + bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val), + bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); scaled = bld.vop1(op, bld.def(v1), scaled); scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled); @@ -1122,7 +1155,8 @@ void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); } -void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_rcp_f32, dst, val); @@ -1132,7 +1166,8 @@ void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u); } -void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_rsq_f32, dst, val); @@ -1142,7 +1177,8 @@ void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); } -void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_sqrt_f32, dst, val); @@ -1152,7 +1188,8 @@ void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); } -void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) +void +emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->block->fp_mode.denorm32 == 0) { bld.vop1(aco_opcode::v_log_f32, dst, val); @@ -1162,7 +1199,8 @@ void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); } -Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +Temp +emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); @@ -1181,11 +1219,13 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); /* Extract the fractional part. 
*/ - Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); + Temp fract_mask = + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask); + bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), + fract_mask); Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); @@ -1197,8 +1237,10 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); /* Decide the operation to apply depending on the unbiased exponent. */ - Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); - Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0); + Temp exp_lt0 = + bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); + Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, + bld.copy(bld.def(v1), Operand(0u)), exp_lt0); Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u)); dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); @@ -1207,7 +1249,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); } -Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +Temp +emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) { if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); @@ -1217,9 +1260,11 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) Temp src0 = as_vgpr(ctx, val); Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ - Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); + Temp min_val = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); - Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); + Temp isnan = + bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); @@ -1239,11 +1284,13 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return add->definitions[0].getTemp(); } -Temp uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) +Temp +uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) { if (bld.program->chip_class < GFX8) { Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); - return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand((uint32_t) -1), add.def(1).getTemp()); + return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), + Operand((uint32_t)-1), add.def(1).getTemp()); } 
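   /* Editorial sketch, not part of the original diff: the behaviour the pre-GFX8 path of
    * uadd32_sat() above implements - the carry output of vadd32 drives v_cndmask_b32 so
    * that an overflowing add returns 0xffffffff instead of wrapping. Assuming plain
    * 32-bit unsigned inputs (uint32_t/uint64_t), a minimal C++ reference of that
    * saturating add is: */
   auto uadd32_sat_ref = [](uint32_t a, uint32_t b) -> uint32_t
   {
      uint64_t sum = uint64_t(a) + b;                         /* widen so the carry is observable */
      return sum > 0xffffffffu ? 0xffffffffu : uint32_t(sum); /* clamp on overflow */
   };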
Builder::Result add(NULL); @@ -1256,7 +1303,8 @@ Temp uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) return dst.getTemp(); } -void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) +void +visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) { if (!instr->dest.dest.is_ssa) { isel_err(&instr->instr, "nir alu dst not in ssa"); @@ -1265,18 +1313,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); - switch(instr->op) { + switch (instr->op) { case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: case nir_op_vec5: { - std::array elems; + std::array elems; unsigned num = instr->dest.dest.ssa.num_components; for (unsigned i = 0; i < num; ++i) elems[i] = get_alu_src(ctx, instr->src[i]); if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u); for (unsigned i = 0; i < num; ++i) { if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) @@ -1291,7 +1340,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bool use_s_pack = ctx->program->chip_class >= GFX9; Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1)); - std::array packed; + std::array packed; uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; for (unsigned i = 0; i < num; i++) { unsigned packed_size = use_s_pack ? 16 : 32; @@ -1303,32 +1352,36 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } if (offset != packed_size - instr->dest.dest.ssa.bit_size) - elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); + elems[i] = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); if (offset) - elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), - elems[i], Operand(offset)); + elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], + Operand(offset)); if (packed[idx].id()) - packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), - elems[i], packed[idx]); + packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], + packed[idx]); else packed[idx] = elems[i]; } if (use_s_pack) { for (unsigned i = 0; i < dst.size(); i++) { - bool same = !!packed[i*2].id() == !!packed[i*2+1].id(); + bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); - if (packed[i*2].id() && packed[i*2+1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], packed[i*2+1]); - else if (packed[i*2+1].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand(const_vals[i * 2]), packed[i*2+1]); - else if (packed[i*2].id()) - packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i*2], Operand(const_vals[i * 2 + 1])); + if (packed[i * 2].id() && packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + packed[i * 2 + 1]); + else if (packed[i * 2 + 1].id()) + packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), + Operand(const_vals[i * 2]), packed[i * 2 + 1]); + else if (packed[i * 2].id()) + packed[i] = 
bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], + Operand(const_vals[i * 2 + 1])); if (same) - const_vals[i] = const_vals[i*2] | (const_vals[i*2+1] << 16); + const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); else const_vals[i] = 0; } @@ -1347,7 +1400,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) else if (dst.size() == 2) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]); else - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], packed[2]); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], + packed[2]); } break; } @@ -1392,7 +1446,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s1) { bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); } else if (dst.regClass() == v1) { - bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src)); + bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, + bld.vsub32(bld.def(v1), Operand(0u), src)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -1401,15 +1456,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_isign: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == s1) { - Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1)); + Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, + Operand((uint32_t)-1)); bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u)); } else if (dst.regClass() == s2) { - Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); + Temp neg = + bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); Temp neqz; if (ctx->program->chip_class >= GFX8) neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u)); else - neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp(); + neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)) + .def(1) + .getTemp(); /* SCC gets zero-extended to 64 bit */ bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); } else if (dst.regClass() == v1) { @@ -1417,7 +1476,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { Temp upper = emit_extract_vector(ctx, src, 1, v1); Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp gtz = + bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); @@ -1548,8 +1608,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), + 
get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); } else if (dst.regClass() == s2) { @@ -1569,10 +1629,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, false, 1); + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, + false, 1); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); } else if (dst.regClass() == s1) { @@ -1594,8 +1655,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { - bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), - get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), + get_alu_src(ctx, instr->src[0])); } else if (dst.regClass() == v2) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); } else if (dst.regClass() == s1) { @@ -1624,9 +1685,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ifind_msb: { Temp src = get_alu_src(ctx, instr->src[0]); if (src.regClass() == s1 || src.regClass() == s2) { - aco_opcode op = src.regClass() == s2 ? - (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) : - (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32); + aco_opcode op = src.regClass() == s2 + ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 + : aco_opcode::s_flbit_i32_i64) + : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 + : aco_opcode::s_flbit_i32); Temp msb_rev = bld.sop1(op, bld.def(s1), src); Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), @@ -1634,30 +1697,38 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp msb = sub.def(0).getTemp(); Temp carry = sub.def(1).getTemp(); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, + bld.scc(carry)); } else if (src.regClass() == v1) { - aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + aco_opcode op = + instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; Temp msb_rev = bld.tmp(v1); emit_vop1_instruction(ctx, instr, op, msb_rev); Temp msb = bld.tmp(v1); - Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + Temp carry = + bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), + carry); } else if (src.regClass() == v2) { - aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + aco_opcode op = + instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand(32u)), - bld.vop1(op, bld.def(v1), lo)); + bld.vop1(op, bld.def(v1), lo)); hi = bld.vop1(op, bld.def(v1), hi); - Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand((uint32_t)-1), hi); + Temp found_hi = + bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand((uint32_t)-1), hi); Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi); Temp msb = bld.tmp(v1); - Temp carry = bld.vsub32(Definition(msb), Operand(63u), Operand(msb_rev), true).def(1).getTemp(); - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + Temp carry = + bld.vsub32(Definition(msb), Operand(63u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), + carry); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -1705,8 +1776,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp carry = bld.tmp(s1); - Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry)); + Temp dst0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(carry)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else if (dst.regClass() == v2) { Temp dst0 = bld.tmp(v1); @@ -1723,17 +1796,18 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == s1) { Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); - bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), - src0, src1); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry)); + bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), tmp, + bld.scc(carry)); } else if (dst.regClass() == v2b) { - Instruction *add_instr; + Instruction* add_instr; if (ctx->program->chip_class >= GFX10) { add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; } else { if (src1.type() == RegType::sgpr) std::swap(src0, src1); - add_instr = bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; + add_instr = + 
bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; } add_instr->vop3().clamp = 1; } else if (dst.regClass() == v1) { @@ -1765,12 +1839,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp carry = bld.tmp(s1); bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); - carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp(); + carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(carry)) + .def(1) + .getTemp(); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); } else if (dst.regClass() == v2) { Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); - carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); + carry = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -1811,8 +1889,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); if (dst.regClass() == s2) { Temp borrow = bld.tmp(s1); - Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(borrow)); + Temp dst0 = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, + bld.scc(borrow)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else if (dst.regClass() == v2) { Temp lower = bld.tmp(v1); @@ -1845,12 +1925,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { Temp borrow = bld.tmp(s1); bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); - borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp(); + borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, + bld.scc(borrow)) + .def(1) + .getTemp(); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); } else if (dst.regClass() == v2) { Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); - borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); + borrow = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -1870,25 +1954,22 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); - if (src0_ub <= 0xffff && src1_ub <= 0xffff && - src0_ub * src1_ub <= 0xffff && - (ctx->options->chip_class == GFX8 || - ctx->options->chip_class == GFX9)) { + if (src0_ub <= 0xffff && src1_ub <= 0xffff && 
src0_ub * src1_ub <= 0xffff && + (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) { /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16 * but only on GFX8-9 because GFX10 doesn't zero the upper 16 * bits. */ - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, - true /* commutative */, false, false, - true /* nuw */); - } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && - ctx->options->chip_class >= GFX9) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */, + false, false, true /* nuw */); + } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) { /* Initialize the accumulator to 0 to allow further combinations * in the optimizer. */ Operand op0(src0); Operand op1(src1); - bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), bld.set16bit(op1), Operand(0u)); + bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), + bld.set16bit(op1), Operand(0u)); } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true); } else if (nir_src_is_const(instr->src[0].src)) { @@ -1992,8 +2073,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) else emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); } else if (dst.regClass() == v2) { - Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), - as_vgpr(ctx, src0), as_vgpr(ctx, src1)); + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0), + as_vgpr(ctx, src1)); add->vop3().neg[1] = true; } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -2007,9 +2088,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, ctx->block->fp_mode.must_flush_denorms16_64); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -2022,9 +2105,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); } else if (dst.regClass() == v1) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, + ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, ctx->block->fp_mode.must_flush_denorms16_64); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, + ctx->block->fp_mode.must_flush_denorms16_64); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -2032,27 +2117,23 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_cube_face_coord_amd: { Temp in = get_alu_src(ctx, 
instr->src[0], 3); - Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), - emit_extract_vector(ctx, in, 1, v1), - emit_extract_vector(ctx, in, 2, v1) }; + Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1)}; Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); - sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), - Operand(0x3f000000u/*0.5*/), + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3f000000u /*0.5*/), bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma)); - tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), - Operand(0x3f000000u/*0.5*/), + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3f000000u /*0.5*/), bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); break; } case nir_op_cube_face_index_amd: { Temp in = get_alu_src(ctx, instr->src[0], 3); - Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), - emit_extract_vector(ctx, in, 1, v1), - emit_extract_vector(ctx, in, 2, v1) }; + Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1)}; bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); break; } @@ -2084,12 +2165,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0xbc00u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0xbc00u), + as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0xbf800000u), as_vgpr(ctx, src)); } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), + Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); @@ -2102,14 +2185,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand((uint16_t)0x3c00), as_vgpr(ctx, src)).instr; + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), + Operand((uint16_t)0x3c00), as_vgpr(ctx, src)) + .instr; mul->vop3().abs[1] = true; } else if (dst.regClass() == v1) { - Instruction *mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), as_vgpr(ctx, src)).instr; + Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), + Operand(0x3f800000u), as_vgpr(ctx, src)) + .instr; mul->vop3().abs[1] = true; } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) - src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); + src = 
bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), + Operand(UINT64_C(0x3FF0000000000000)), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); @@ -2122,18 +2210,21 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { Temp src = get_alu_src_vop3p(ctx, instr->src[0]); - Instruction* vop3p = bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand(uint16_t(0x3C00)), - instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); + Instruction* vop3p = + bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand(uint16_t(0x3C00)), + instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); vop3p->vop3p().clamp = true; emit_split_vector(ctx, dst, 2); break; } Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), + Operand((uint16_t)0x3c00), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); - /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ + /* apparently, it is not necessary to flush denorms if this instruction is used with these + * operands */ // TODO: confirm that this holds under any circumstances } else if (dst.regClass() == v2) { Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u)); @@ -2234,10 +2325,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u)); - Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1); - Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); - add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add); + Temp tmp1 = + bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), + tmp0, tmp1); + Temp add = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), + bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); + add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), + bld.copy(bld.def(v1), Operand(0u)), add); bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); } } else { @@ -2272,22 +2368,32 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); - Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); - Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); - Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, 
bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); - Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Temp bitmask = + bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); + Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, + bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); + Temp tmp = + bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Instruction* sub = + bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); sub->vop3().neg[1] = true; tmp = sub->definitions[0].getTemp(); - Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu)); - Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), + Operand(0x432fffffu)); + Instruction* vop3 = + bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); vop3->vop3().abs[0] = true; Temp cond = vop3->definitions[0].getTemp(); Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); - Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond); - Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond); + Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, + as_vgpr(ctx, src0_lo), cond); + Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, + as_vgpr(ctx, src0_hi), cond); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } @@ -2303,7 +2409,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == v2b) { Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); - aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; + aco_opcode opcode = + instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); @@ -2313,7 +2420,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (ctx->options->chip_class < GFX9) tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp); - aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; + aco_opcode opcode = + instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; bld.vop1(opcode, Definition(dst), tmp); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -2365,16 +2473,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(ctx->program->chip_class >= GFX9); /* replace negative zero with positive zero */ src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand(0u), src); - src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand((uint16_t)-1), src, Operand((uint16_t)1u)); + src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand((uint16_t)-1), src, + Operand((uint16_t)1u)); bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); } else if (dst.regClass() == v1) { src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0u), src); - src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand((uint32_t)-1), src, Operand(1u)); + src = + bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand((uint32_t)-1), src, Operand(1u)); bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); } else if (dst.regClass() == v2) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + Temp cond = + bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp tmp = bld.copy(bld.def(v1), Operand(0x3FF00000u)); - Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, + emit_extract_vector(ctx, src, 1, v1), cond); cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); tmp = bld.copy(bld.def(v1), Operand(0xBFF00000u)); @@ -2673,14 +2785,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); + exponent = + bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); Temp new_exponent = bld.tmp(v1); - Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); + Temp borrow = + bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); if (ctx->program->chip_class >= GFX8) mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); else @@ -2688,7 +2802,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu)); Temp lower = bld.tmp(v1), upper = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); - lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow); + lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), + borrow); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); lower = 
bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); @@ -2700,18 +2815,29 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); - Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); - exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); - Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); - Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); - mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); - mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); + Temp exponent = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = + bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = + bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = + bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); + Temp mantissa = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + Temp sign = + bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(0x800000u), mantissa); + mantissa = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); - exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); - mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); - Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64 + exponent = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); + mantissa = + bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); + Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, + Operand(0xffffffffu)); // exp >= 64 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu)); mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); Temp lower = bld.tmp(s1), upper = bld.tmp(s1); @@ -2719,15 +2845,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower); upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); Temp borrow = bld.tmp(s1); - lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); - upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, bld.scc(borrow)); + lower = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); + upper = 
bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, + bld.scc(borrow)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size == 64) { - Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); @@ -2750,7 +2880,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), + Operand(64u), exponent); exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); @@ -2758,7 +2889,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); Temp new_exponent = bld.tmp(v1); - Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); + Temp cond_small = + bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); if (ctx->program->chip_class >= GFX8) mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); else @@ -2766,38 +2898,54 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp lower = bld.tmp(v1), upper = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); - upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); - lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range); - upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range); + upper = + bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, + exponent_in_range); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, + exponent_in_range); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); - Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 
Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); - Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); - mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); - Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); - Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small); + Temp exponent = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = + bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = + bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + Temp mantissa = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(0x800000u), mantissa); + Temp exponent_small = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); + Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, + exponent_small); mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); - Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); - mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large); + Temp exponent_large = + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); + mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, + exponent_large); Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent); - mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); + mantissa = + bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); Temp lower = bld.tmp(s1), upper = bld.tmp(s1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); - Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); + Temp cond_small = + bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else if (instr->src[0].src.ssa->bit_size == 64) { - Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + vec = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = 
bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); @@ -2836,7 +2984,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) src = bool_to_scalar_condition(ctx, src); bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), + src); } else { unreachable("Wrong destination register class for nir_op_b2f32."); } @@ -2848,7 +2997,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s2) { src = bool_to_scalar_condition(ctx, src); - bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src)); + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), + bld.scc(src)); } else if (dst.regClass() == v2) { Temp one = bld.copy(bld.def(v2), Operand(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); @@ -2864,14 +3014,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_i2i64: { if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? - sgpr_extract_sext : sgpr_extract_undef; + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size + ? sgpr_extract_sext + : sgpr_extract_undef; extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); } else { const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; const unsigned output_bitsize = instr->dest.dest.ssa.bit_size; - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - input_bitsize, output_bitsize, output_bitsize > input_bitsize, dst); + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, + output_bitsize > input_bitsize, dst); } break; } @@ -2881,12 +3032,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_u2u64: { if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { /* no need to do the extract in get_alu_src() */ - sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? - sgpr_extract_zext : sgpr_extract_undef; + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size + ? sgpr_extract_zext + : sgpr_extract_undef; extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); } else { - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, + instr->dest.dest.ssa.bit_size, false, dst); } break; } @@ -2920,12 +3072,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(src.regClass() == v1 || src.regClass() == v2); assert(dst.regClass() == bld.lm); bld.vopc(src.size() == 2 ? 
aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, - Definition(dst), Operand(0u), src).def(0).setHint(vcc); + Definition(dst), Operand(0u), src) + .def(0) + .setHint(vcc); } else { assert(src.regClass() == s1 || src.regClass() == s2); Temp tmp; if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) { - tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp(); + tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src) + .def(1) + .getTemp(); } else { tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, bld.scc(bld.def(s1)), Operand(0u), src); @@ -2948,21 +3104,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_unpack_64_2x32_split_x: - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); break; case nir_op_unpack_64_2x32_split_y: - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); break; case nir_op_unpack_32_2x16_split_x: if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), + get_alu_src(ctx, instr->src[0])); } else { bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); } break; case nir_op_unpack_32_2x16_split_y: if (dst.type() == RegType::vgpr) { - bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), + get_alu_src(ctx, instr->src[0])); } else { bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(1u), Operand(16u), Operand(0u)); @@ -2976,7 +3136,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) src1 = emit_extract_vector(ctx, src1, 0, v2b); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); } else { - src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu)); + src0 = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu)); src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u)); bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); } @@ -2988,14 +3149,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) { /* upper bits zero on GFX6-GFX9 */ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (!ctx->block->fp_mode.care_about_round16_64 || ctx->block->fp_mode.round16_64 == fp_round_tz) { + } else if (!ctx->block->fp_mode.care_about_round16_64 || + ctx->block->fp_mode.round16_64 == fp_round_tz) { if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9) emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); else emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); } else { - Temp src0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), 
get_alu_src(ctx, instr->src[0])); - Temp src1 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1])); + Temp src0 = + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0])); + Temp src1 = + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1])); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); } } else { @@ -3009,7 +3173,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (src.regClass() == v1) src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); if (dst.regClass() == v1) { - assert(ctx->block->fp_mode.must_flush_denorms16_64 == (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); + assert(ctx->block->fp_mode.must_flush_denorms16_64 == + (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -3022,9 +3187,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (src.regClass() == s1) src = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand(16u)); else - src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); + src = + bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); if (dst.regClass() == v1) { - assert(ctx->block->fp_mode.must_flush_denorms16_64 == (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); + assert(ctx->block->fp_mode.must_flush_denorms16_64 == + (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); @@ -3042,8 +3209,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp f32, cmp_res; if (ctx->program->chip_class >= GFX8) { - Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ - cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); + Temp mask = bld.copy(bld.def(s1), + Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ + cmp_res = + bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); } else { /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, @@ -3053,12 +3222,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u)); Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); tmp0->vop3().abs[0] = true; - Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), f32); - cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), tmp0->definitions[0].getTemp(), tmp1); + Temp tmp1 = + bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), f32); + cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), + tmp0->definitions[0].getTemp(), tmp1); } if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) { - Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); + Temp copysign_0 = + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); } else { 
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); @@ -3092,7 +3264,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (const_insert && const_bitmask) { lhs = Operand(const_insert->u32 & const_bitmask->u32); } else { - insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); + insert = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); lhs = Operand(insert); } @@ -3126,7 +3299,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); if (const_offset && const_bits) { uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f); - aco_opcode opcode = instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; + aco_opcode opcode = + instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand(extract)); break; } @@ -3135,20 +3309,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp bits = get_alu_src(ctx, instr->src[2]); if (instr->op == nir_op_ubfe) { Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); - Temp masked = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); + Temp masked = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); } else { - Operand bits_op = const_bits ? Operand(const_bits->u32 << 16) : - bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u)); - Operand offset_op = const_offset ? Operand(const_offset->u32 & 0x1fu) : - bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(0x1fu)); + Operand bits_op = const_bits ? Operand(const_bits->u32 << 16) + : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), + bld.def(s1, scc), bits, Operand(16u)); + Operand offset_op = const_offset ? Operand(const_offset->u32 & 0x1fu) + : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), + bld.def(s1, scc), offset, Operand(0x1fu)); - Temp extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); + Temp extract = + bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); } } else { - aco_opcode opcode = instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; + aco_opcode opcode = + instr->op == nir_op_ubfe ? 
aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); } break; @@ -3184,12 +3363,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } assert(def.bytes() <= 4); if (def.regClass() == s1) { - bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), - Operand(index), Operand(bits), Operand((uint32_t)is_signed)); + bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), Operand(index), + Operand(bits), Operand((uint32_t)is_signed)); } else { src = emit_extract_vector(ctx, src, 0, def.regClass()); - bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand(index), - Operand(bits), Operand((uint32_t)is_signed)); + bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand(index), Operand(bits), + Operand((uint32_t)is_signed)); } if (dst.size() == 2) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), Operand(0u)); @@ -3215,7 +3394,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) def = bld.def(src.type(), 1); } if (def.regClass() == s1) { - bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), Operand(index), Operand(bits)); + bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), Operand(index), + Operand(bits)); } else { src = emit_extract_vector(ctx, src, 0, def.regClass()); bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand(index), Operand(bits)); @@ -3234,8 +3414,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (src.regClass() == v1) { bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u)); } else if (src.regClass() == v2) { - bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), - emit_extract_vector(ctx, src, 1, v1), + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), emit_extract_vector(ctx, src, 0, v1), Operand(0u))); } else if (src.regClass() == s2) { @@ -3246,51 +3425,63 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_flt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, + aco_opcode::v_cmp_lt_f64); break; } case nir_op_fge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, + aco_opcode::v_cmp_ge_f64); break; } case nir_op_feq: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, + aco_opcode::v_cmp_eq_f64); break; } case nir_op_fneu: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, + aco_opcode::v_cmp_neq_f64); break; } case nir_op_ilt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, + aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); break; } case nir_op_ige: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, 
aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, + aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); break; } case nir_op_ieq: { if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, - ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, + aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); break; } case nir_op_ine: { if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, - ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); + emit_comparison( + ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, + aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, + ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); break; } case nir_op_ult: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, + aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); break; } case nir_op_uge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, + aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); break; } case nir_op_fddx: @@ -3327,12 +3518,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_wqm(bld, tmp, dst, true); break; } - default: - isel_err(&instr->instr, "Unknown NIR ALU instr"); + default: isel_err(&instr->instr, "Unknown NIR ALU instr"); } } -void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) +void +visit_load_const(isel_context* ctx, nir_load_const_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->def); @@ -3347,7 +3538,7 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) if (instr->def.bit_size == 1) { assert(dst.regClass() == bld.lm); int val = instr->value[0].b ? -1 : 0; - Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); + Operand op = bld.lm.size() == 1 ? 
Operand((uint32_t)val) : Operand((uint64_t)val); bld.copy(Definition(dst), op); } else if (instr->def.bit_size == 8) { bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].u8)); @@ -3358,7 +3549,8 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) bld.copy(Definition(dst), Operand(instr->value[0].u32)); } else { assert(dst.size() != 1); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; if (instr->def.bit_size == 64) for (unsigned i = 0; i < dst.size(); i++) vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)}; @@ -3371,10 +3563,11 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) } } -uint32_t widen_mask(uint32_t mask, unsigned multiplier) +uint32_t +widen_mask(uint32_t mask, unsigned multiplier) { uint32_t new_mask = 0; - for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) if (mask & (1u << i)) new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); return new_mask; @@ -3399,9 +3592,8 @@ struct LoadEmitInfo { }; struct EmitLoadParameters { - using Callback = Temp (*)(Builder &bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, + using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset, + unsigned bytes_needed, unsigned align, unsigned const_offset, Temp dst_hint); Callback callback; @@ -3410,14 +3602,15 @@ struct EmitLoadParameters { unsigned max_const_offset_plus_one; }; -void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, - const EmitLoadParameters ¶ms) +void +emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, + const EmitLoadParameters& params) { unsigned load_size = info.num_components * info.component_size; unsigned component_size = info.component_size; unsigned num_vals = 0; - Temp *const vals = (Temp *)alloca(info.dst.bytes() * sizeof(Temp)); + Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp)); unsigned const_offset = info.const_offset; @@ -3435,8 +3628,7 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, } if (byte_align) { - if (bytes_needed > 2 || - (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || + if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || !params.supports_8bit_16bit_loads) { if (info.component_stride) { assert(params.supports_8bit_16bit_loads && "unimplemented"); @@ -3463,22 +3655,21 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, unsigned reduced_const_offset = const_offset; bool remove_const_offset_completely = need_to_align_offset; if (const_offset && - (remove_const_offset_completely || - const_offset >= params.max_const_offset_plus_one)) { + (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) { unsigned to_add = const_offset; if (remove_const_offset_completely) { reduced_const_offset = 0; } else { - to_add = const_offset / params.max_const_offset_plus_one * - params.max_const_offset_plus_one; + to_add = + const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one; reduced_const_offset %= params.max_const_offset_plus_one; } Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); if (offset.isConstant()) { offset = Operand(offset.constantValue() + to_add); } else if (offset_tmp.regClass() == s1) { - offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset_tmp, Operand(to_add)); + offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp, + Operand(to_add)); } else if (offset_tmp.regClass() == v1) { offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add)); } else { @@ -3488,12 +3679,14 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset_tmp.regClass() == s2) { Temp carry = bld.tmp(s1); - lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add)); + lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, + Operand(to_add)); hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry); offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi); } else { Temp new_lo = bld.tmp(v1); - Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp(); + Temp carry = + bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp(); hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry); offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi); } @@ -3509,11 +3702,14 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset.isConstant()) { aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); } else if (offset_tmp.regClass() == s1) { - aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp); + aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + Operand(0xfffffffcu), offset_tmp); } else if (offset_tmp.regClass() == s2) { - aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp); + aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), + Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp); } else if (offset_tmp.regClass() == v1) { - aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp); + aligned_offset = + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp); } else if (offset_tmp.regClass() == v2) { Temp hi = bld.tmp(v1), lo = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); @@ -3521,13 +3717,11 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); } } - Temp aligned_offset_tmp = aligned_offset.isTemp() ? - aligned_offset.getTemp() : - bld.copy(bld.def(s1), aligned_offset); + Temp aligned_offset_tmp = + aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); - Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, - align, reduced_const_offset, - byte_align ? Temp() : info.dst); + Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align, + reduced_const_offset, byte_align ? 
Temp() : info.dst); /* the callback wrote directly to dst */ if (val == info.dst) { @@ -3543,7 +3737,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (offset.isConstant()) byte_align_off = Operand(offset.constantValue() % 4u); else if (offset.size() == 2) - byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1))); + byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, + RegClass(offset.getTemp().type(), 1))); else byte_align_off = offset; } @@ -3574,7 +3769,7 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, std::array allocated_vec; bool has_vgprs = false; for (unsigned i = 0; i < num_vals;) { - Temp *const tmp = (Temp *)alloca(num_vals * sizeof(Temp)); + Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp)); unsigned num_tmps = 0; unsigned tmp_size = 0; RegType reg_type = RegType::sgpr; @@ -3597,7 +3792,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (tmp[0].bytes() % component_size) { /* trim tmp[0] */ assert(i == num_vals); - RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); + RegClass new_rc = + RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u)); } @@ -3633,7 +3829,8 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, if (info.dst.type() == RegType::vgpr || !has_vgprs) ctx->allocated_vec.emplace(info.dst.id(), allocated_vec); - int padding_bytes = MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); + int padding_bytes = + MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)}; @@ -3652,16 +3849,16 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info, } } -Operand load_lds_size_m0(Builder& bld) +Operand +load_lds_size_m0(Builder& bld) { /* TODO: m0 does not need to be initialized on GFX9+ */ return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0xffffffffu))); } -Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, - Temp dst_hint) +Temp +lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, Temp dst_hint) { offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset; @@ -3714,7 +3911,7 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, RegClass rc = RegClass::get(RegType::vgpr, size); Temp val = rc == info.dst.regClass() && dst_hint.id() ? 
dst_hint : bld.tmp(rc); - Instruction *instr; + Instruction* instr; if (read2) instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); else @@ -3724,12 +3921,11 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters lds_load_params { lds_load_callback, false, true, UINT32_MAX }; +const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX}; -Temp smem_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align, unsigned const_offset, - Temp dst_hint) +Temp +smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, Temp dst_hint) { unsigned size = 0; aco_opcode op; @@ -3767,15 +3963,14 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters smem_load_params { smem_load_callback, true, false, 1024 }; +const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024}; -Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align_, unsigned const_offset, - Temp dst_hint) +Temp +mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, Temp dst_hint) { Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); if (info.soffset.id()) { if (soffset.isTemp()) @@ -3823,23 +4018,25 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters mubuf_load_params { mubuf_load_callback, true, true, 4096 }; -const EmitLoadParameters scratch_load_params { mubuf_load_callback, false, true, 4096 }; +const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096}; +const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096}; -Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) +Temp +get_gfx6_global_rsrc(Builder& bld, Temp addr) { uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); if (addr.type() == RegType::vgpr) - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf)); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), + Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), + Operand(rsrc_conf)); } -Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, - Temp offset, unsigned bytes_needed, - unsigned align_, unsigned const_offset, - Temp dst_hint) +Temp +global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, Temp dst_hint) { unsigned bytes_size = 0; bool use_mubuf = bld.program->chip_class == GFX6; @@ -3847,27 +4044,38 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, aco_opcode op; if (bytes_needed == 1) { bytes_size = 1; - op = use_mubuf ? aco_opcode::buffer_load_ubyte : global ? 
aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte; + op = use_mubuf ? aco_opcode::buffer_load_ubyte + : global ? aco_opcode::global_load_ubyte + : aco_opcode::flat_load_ubyte; } else if (bytes_needed == 2) { bytes_size = 2; - op = use_mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort; + op = use_mubuf ? aco_opcode::buffer_load_ushort + : global ? aco_opcode::global_load_ushort + : aco_opcode::flat_load_ushort; } else if (bytes_needed <= 4) { bytes_size = 4; - op = use_mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + op = use_mubuf ? aco_opcode::buffer_load_dword + : global ? aco_opcode::global_load_dword + : aco_opcode::flat_load_dword; } else if (bytes_needed <= 8) { bytes_size = 8; - op = use_mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + op = use_mubuf ? aco_opcode::buffer_load_dwordx2 + : global ? aco_opcode::global_load_dwordx2 + : aco_opcode::flat_load_dwordx2; } else if (bytes_needed <= 12 && !use_mubuf) { bytes_size = 12; op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; } else { bytes_size = 16; - op = use_mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + op = use_mubuf ? aco_opcode::buffer_load_dwordx4 + : global ? aco_opcode::global_load_dwordx4 + : aco_opcode::flat_load_dwordx4; } RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); if (use_mubuf) { - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 3, 1)}; mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset)); mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -3882,7 +4090,8 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, } else { offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; + aco_ptr flat{ + create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 2, 1)}; flat->operands[0] = Operand(offset); flat->operands[1] = Operand(s1); flat->glc = info.glc; @@ -3896,10 +4105,11 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo &info, return val; } -const EmitLoadParameters global_load_params { global_load_callback, true, true, 1 }; +const EmitLoadParameters global_load_params{global_load_callback, true, true, 1}; -Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, - Temp address, unsigned base_offset, unsigned align) +Temp +load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, + Temp address, unsigned base_offset, unsigned align) { assert(util_is_power_of_two_nonzero(align)); @@ -3915,7 +4125,9 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, unsigned num_componen return dst; } -void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *bytes, Temp src) +void +split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes, + Temp src) { if (!count) return; @@ -3932,7 +4144,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp } /* elem_size_bytes is the greatest common divisor which is a power of 2 */ - unsigned elem_size_bytes = 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1); + unsigned elem_size_bytes = + 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1); ASSERTED bool is_subdword = elem_size_bytes < 4; assert(!is_subdword || dst_type == RegType::vgpr); @@ -3956,12 +4169,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp if (elem_size_bytes % elem_size) goto split; - temps.insert(temps.end(), it->second.begin(), - it->second.begin() + src.bytes() / elem_size); + temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size); elem_size_bytes = elem_size; } - split: +split: /* split src if necessary */ if (temps.empty()) { if (is_subdword && src.type() == RegType::sgpr) @@ -3970,7 +4182,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp src = bld.as_uniform(src); unsigned num_elems = src.bytes() / elem_size_bytes; - aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; split->operands[0] = Operand(src); for (unsigned i = 0; i < num_elems; i++) { temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes))); @@ -3990,7 +4203,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp continue; } - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)}; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, op_count, 1)}; for (unsigned j = 0; j < op_count; j++) { Temp tmp = temps[idx++]; if (dst_type == RegType::sgpr) @@ -4003,8 +4217,8 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp return; } -bool scan_write_mask(uint32_t mask, uint32_t todo_mask, - int *start, int *count) +bool +scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count) { unsigned start_elem = ffs(todo_mask) - 1; bool skip = !(mask & (1 << start_elem)); @@ -4018,13 +4232,15 @@ bool scan_write_mask(uint32_t mask, uint32_t todo_mask, return !skip; } -void advance_write_mask(uint32_t *todo_mask, int start, int count) +void 
+advance_write_mask(uint32_t* todo_mask, int start, int count) { *todo_mask &= ~u_bit_consecutive(0, count) << start; } -void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, - Temp address, unsigned base_offset, unsigned align) +void +store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address, + unsigned base_offset, unsigned align) { assert(util_is_power_of_two_nonzero(align)); assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); @@ -4058,7 +4274,7 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t bool aligned8 = offset % 8 == 0 && align % 8 == 0; bool aligned16 = offset % 16 == 0 && align % 16 == 0; - //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial + // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial aco_opcode op = aco_opcode::num_opcodes; if (byte >= 16 && aligned16 && large_ds_write) { op = aco_opcode::ds_write_b128; @@ -4121,13 +4337,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); inline_offset = offsets[i]; } - assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ - Instruction *instr; + /* offsets[i] shouldn't be large enough for this to happen */ + assert(inline_offset <= max_offset); + + Instruction* instr; if (write2) { Temp second_data = write_datas[second]; inline_offset /= split_data.bytes(); - instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, inline_offset + write2_off); + instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, + inline_offset + write2_off); } else { instr = bld.ds(op, address_offset, split_data, m, inline_offset); } @@ -4135,29 +4354,25 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t } } -aco_opcode get_buffer_store_op(unsigned bytes) +aco_opcode +get_buffer_store_op(unsigned bytes) { switch (bytes) { - case 1: - return aco_opcode::buffer_store_byte; - case 2: - return aco_opcode::buffer_store_short; - case 4: - return aco_opcode::buffer_store_dword; - case 8: - return aco_opcode::buffer_store_dwordx2; - case 12: - return aco_opcode::buffer_store_dwordx3; - case 16: - return aco_opcode::buffer_store_dwordx4; + case 1: return aco_opcode::buffer_store_byte; + case 2: return aco_opcode::buffer_store_short; + case 4: return aco_opcode::buffer_store_dword; + case 8: return aco_opcode::buffer_store_dwordx2; + case 12: return aco_opcode::buffer_store_dwordx3; + case 16: return aco_opcode::buffer_store_dwordx4; } unreachable("Unexpected store size"); return aco_opcode::num_opcodes; } -void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type, - Temp data, unsigned writemask, int swizzle_element_size, - unsigned *write_count, Temp *write_datas, unsigned *offsets) +void +split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type, + Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count, + Temp* write_datas, unsigned* offsets) { unsigned write_count_with_skips = 0; bool skips[16]; @@ -4211,8 +4426,9 @@ void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem } } -Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes, - unsigned split_cnt = 0u, Temp dst = Temp()) +Temp 
+create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type, + unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp()) { Builder bld(ctx->program, ctx->block); unsigned dword_size = elem_size_bytes / 4; @@ -4221,7 +4437,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType dst = bld.tmp(RegClass(reg_type, cnt * dword_size)); std::array allocated_vec; - aco_ptr instr {create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; + aco_ptr instr{ + create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; instr->definitions[0] = Definition(dst); for (unsigned i = 0; i < cnt; ++i) { @@ -4230,7 +4447,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType allocated_vec[i] = arr[i]; instr->operands[i] = Operand(arr[i]); } else { - Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2)); + Temp zero = + bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2)); allocated_vec[i] = zero; instr->operands[i] = Operand(zero); } @@ -4246,7 +4464,8 @@ Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType return dst; } -inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset) +inline unsigned +resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset) { if (const_offset >= 4096) { unsigned excess_const_offset = const_offset / 4096u * 4096u; @@ -4255,7 +4474,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un if (!voffset.id()) voffset = bld.copy(bld.def(v1), Operand(excess_const_offset)); else if (unlikely(voffset.regClass() == s1)) - voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset)); + voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), + Operand(excess_const_offset), Operand(voffset)); else if (likely(voffset.regClass() == v1)) voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset)); else @@ -4265,9 +4485,10 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un return const_offset; } -void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, memory_sync_info sync=memory_sync_info(), - bool slc = false, bool swizzled = false) +void +emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, + unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(), + bool slc = false, bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -4279,17 +4500,20 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand(0u); - Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, - /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, - /* dlc*/ false, /* slc */ slc); + Builder::Result r = + bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); r.instr->mubuf().sync = sync; } -void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, - unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, - bool allow_combining = true, memory_sync_info sync=memory_sync_info(), bool slc = false) +void +store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, + unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, + bool allow_combining = true, memory_sync_info sync = memory_sync_info(), + bool slc = false) { Builder bld(ctx->program, ctx->block); assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); @@ -4299,19 +4523,21 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; - split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, - allow_combining ? 16 : 4, &write_count, write_datas, offsets); + split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4, + &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, + slc, !allow_combining); } } -void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, - unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, - unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, - bool slc = false) +void +load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, + unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, + unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, + bool slc = false) { assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert((num_components * elem_size_bytes) == dst.bytes()); @@ -4331,14 +4557,16 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, emit_load(ctx, bld, info, mubuf_load_params); } -Temp wave_id_in_threadgroup(isel_context *ctx) +Temp +wave_id_in_threadgroup(isel_context* ctx) { Builder bld(ctx->program, ctx->block); return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16))); } -Temp thread_id_in_threadgroup(isel_context *ctx) +Temp +thread_id_in_threadgroup(isel_context* ctx) { /* tid_in_tg = wave_id * wave_size + tid_in_wave */ @@ -4349,28 +4577,27 @@ Temp thread_id_in_threadgroup(isel_context *ctx) return tid_in_wave; Temp wave_id_in_tg = wave_id_in_threadgroup(ctx); - Temp num_pre_threads = 
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg, - Operand(ctx->program->wave_size == 64 ? 6u : 5u)); + Temp num_pre_threads = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), + wave_id_in_tg, Operand(ctx->program->wave_size == 64 ? 6u : 5u)); return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave)); } -Temp get_tess_rel_patch_id(isel_context *ctx) +Temp +get_tess_rel_patch_id(isel_context* ctx) { Builder bld(ctx->program, ctx->block); switch (ctx->shader->info.stage) { case MESA_SHADER_TESS_CTRL: - return bld.pseudo(aco_opcode::p_extract, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), + return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(0u), Operand(8u), Operand(0u)); - case MESA_SHADER_TESS_EVAL: - return get_arg(ctx, ctx->args->ac.tes_rel_patch_id); - default: - unreachable("Unsupported stage in get_tess_rel_patch_id"); + case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id); + default: unreachable("Unsupported stage in get_tess_rel_patch_id"); } } -bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) +bool +store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned write_mask = nir_intrinsic_write_mask(instr); unsigned component = nir_intrinsic_component(instr); @@ -4398,40 +4625,41 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst) +bool +load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst) { /* Only TCS per-vertex inputs are supported by this function. - * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same. + * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations + * is the same. 
*/ if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) return false; - nir_src *off_src = nir_get_io_offset_src(instr); - nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); - nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; - bool can_use_temps = nir_src_is_const(*off_src) && - vertex_index_instr->type == nir_instr_type_intrinsic && - nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; + nir_src* off_src = nir_get_io_offset_src(instr); + nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr); + nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr; + bool can_use_temps = + nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; if (!can_use_temps) return false; - unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src); - Temp *src = &ctx->inputs.temps[idx]; + unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + + 4 * nir_src_as_uint(*off_src); + Temp* src = &ctx->inputs.temps[idx]; create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst); return true; } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos); +static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos); -void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr) { - if (ctx->stage == vertex_vs || - ctx->stage == tess_eval_vs || - ctx->stage == fragment_fs || - ctx->stage == vertex_ngg || - ctx->stage == tess_eval_ngg || + if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs || + ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg || (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { bool stored_to_temps = store_output_to_temps(ctx, instr); @@ -4443,13 +4671,17 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) unreachable("Shader stage not implemented"); } - /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we have to emit an exp here manually */ - if (ctx->stage.hw == HWStage::NGG && (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) && + /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we + * have to emit an exp here manually */ + if (ctx->stage.hw == HWStage::NGG && + (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) && nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID) export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL); } -void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask) +void +emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst, + Temp prim_mask) { Temp coord1 = emit_extract_vector(ctx, src, 0, v1); Temp coord2 = emit_extract_vector(ctx, src, 1, v1); @@ -4460,47 +4692,48 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp if (ctx->program->dev.has_16bank_lds) { assert(ctx->options->chip_class <= GFX8); Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), - Operand(2u) /* P0 */, 
bld.m0(prim_mask), idx, component); - interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), - coord1, bld.m0(prim_mask), interp_p1, idx, component); - bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, - bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u) /* P0 */, + bld.m0(prim_mask), idx, component); + interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1, + bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask), + interp_p1, idx, component); } else { aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; if (ctx->options->chip_class == GFX8) interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; - Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), - coord1, bld.m0(prim_mask), idx, component); - bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), - interp_p1, idx, component); + Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); + bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, + component); } } else { - Builder::Result interp_p1 = - bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, - bld.m0(prim_mask), idx, component); + Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); if (ctx->program->dev.has_16bank_lds) interp_p1.instr->operands[0].setLateKill(true); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, - bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, + idx, component); } } -void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) +void +emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components) { Builder bld(ctx->program, ctx->block); - aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); + aco_ptr vec(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); for (unsigned i = 0; i < num_components; i++) vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { assert(num_components == 4); - vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); + vec->operands[3] = + bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); } if (ctx->options->adjust_frag_coord_z && @@ -4525,7 +4758,8 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. 
*/ Temp cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(x_rate)); - vec->operands[2] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond); + vec->operands[2] = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond); } for (Operand& op : vec->operands) @@ -4537,7 +4771,8 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) return; } -void emit_load_frag_shading_rate(isel_context *ctx, Temp dst) +void +emit_load_frag_shading_rate(isel_context* ctx, Temp dst) { Builder bld(ctx->program, ctx->block); Temp cond; @@ -4545,27 +4780,26 @@ void emit_load_frag_shading_rate(isel_context *ctx, Temp dst) /* VRS Rate X = Ancillary[2:3] * VRS Rate Y = Ancillary[4:5] */ - Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), - get_arg(ctx, ctx->args->ac.ancillary), Operand(2u), Operand(2u)); - Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), - get_arg(ctx, ctx->args->ac.ancillary), Operand(4u), Operand(2u)); + Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), + Operand(2u), Operand(2u)); + Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), + Operand(4u), Operand(2u)); /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */ cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(x_rate)); - x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), + x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(4u)), cond); /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */ cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand(1u), Operand(y_rate)); - y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), + y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(1u)), cond); bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate)); } -void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); @@ -4578,11 +4812,11 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr if (instr->dest.ssa.num_components == 1) { emit_interp_instr(ctx, idx, component, coords, dst, prim_mask); } else { - aco_ptr vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); - for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) - { + aco_ptr vec(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); + for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) { Temp tmp = ctx->program->allocateTmp(v1); - emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask); + emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask); vec->operands[i] = Operand(tmp); } vec->definitions[0] = Definition(dst); @@ -4590,8 +4824,9 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr } } -bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info, - unsigned offset, unsigned binding_align, unsigned channels) +bool +check_vertex_fetch_size(isel_context* 
ctx, const ac_data_format_info* vtx_info, unsigned offset, + unsigned binding_align, unsigned channels) { unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; if (vtx_info->chan_byte_size != 4 && channels == 3) @@ -4607,9 +4842,9 @@ bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_i (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0); } -uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info, - unsigned offset, unsigned *channels, unsigned max_channels, - unsigned binding_align) +uint8_t +get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset, + unsigned* channels, unsigned max_channels, unsigned binding_align) { if (!vtx_info->chan_byte_size) { *channels = vtx_info->num_channels; @@ -4640,18 +4875,15 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_ switch (vtx_info->chan_format) { case V_008F0C_BUF_DATA_FORMAT_8: - return std::array{V_008F0C_BUF_DATA_FORMAT_8, - V_008F0C_BUF_DATA_FORMAT_8_8, + return std::array{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_16: - return std::array{V_008F0C_BUF_DATA_FORMAT_16, - V_008F0C_BUF_DATA_FORMAT_16_16, + return std::array{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_32: - return std::array{V_008F0C_BUF_DATA_FORMAT_32, - V_008F0C_BUF_DATA_FORMAT_32_32, + return std::array{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; } @@ -4661,7 +4893,8 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_ /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. 
*/ -Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha) +Temp +adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha) { Builder bld(ctx->program, ctx->block); @@ -4688,7 +4921,8 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph return alpha; } -void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -4697,9 +4931,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) - isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset"); + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); - Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); + Temp vertex_buffers = + convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; unsigned component = nir_intrinsic_component(instr); @@ -4713,7 +4949,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned dfmt = attrib_format & 0xf; unsigned nfmt = (attrib_format >> 4) & 0x7; - const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt); + const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); @@ -4721,8 +4957,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (post_shuffle) num_channels = MAX2(num_channels, 3); - unsigned desc_index = ctx->program->info->vs.use_per_attribute_vb_descs ? - location : attrib_binding; + unsigned desc_index = + ctx->program->info->vs.use_per_attribute_vb_descs ? 
location : attrib_binding; desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, desc_index)); Operand off = bld.copy(bld.def(s1), Operand(desc_index * 16u)); @@ -4745,12 +4981,11 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) index = bld.copy(bld.def(v1), start_instance); } } else { - index = bld.vadd32(bld.def(v1), - get_arg(ctx, ctx->args->ac.base_vertex), + index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex), get_arg(ctx, ctx->args->ac.vertex_id)); } - Temp *const channels = (Temp *)alloca(num_channels * sizeof(Temp)); + Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp)); unsigned channel_start = 0; bool direct_fetch = false; @@ -4771,14 +5006,15 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) /* use MUBUF when possible to avoid possible alignment issues */ /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ - bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || - nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || - nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && - vtx_info->chan_byte_size == 4; + bool use_mubuf = + (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || + nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && + vtx_info->chan_byte_size == 4; unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; if (!use_mubuf) { - fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, - vtx_info->num_channels - channel_start, binding_align); + fetch_dfmt = + get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, + vtx_info->num_channels - channel_start, binding_align); } else { if (fetch_component == 3 && ctx->options->chip_class == GFX6) { /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ @@ -4791,7 +5027,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { - fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); + fetch_index = + bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); fetch_offset = fetch_offset % attrib_stride; } @@ -4812,7 +5049,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) assert(!use_mubuf); opcode = aco_opcode::tbuffer_load_format_d16_xy; } else { - opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + opcode = + use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; } break; case 6: @@ -4824,25 +5062,26 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) assert(!use_mubuf); opcode = aco_opcode::tbuffer_load_format_d16_xyzw; } else { - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + opcode = + use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; } break; case 12: assert(ctx->options->chip_class >= GFX7 || (!use_mubuf && ctx->options->chip_class == GFX6)); - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; + opcode = + use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; case 16: - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; + opcode = + use_mubuf ? 
aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; - default: - unreachable("Unimplemented load_input vector size"); + default: unreachable("Unimplemented load_input vector size"); } Temp fetch_dst; - if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && - !expanded && (alpha_adjust == AC_FETCH_FORMAT_NONE || - num_channels <= 3)) { + if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && + (alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { @@ -4850,14 +5089,14 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } if (use_mubuf) { - Instruction *mubuf = bld.mubuf( - opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, false, true).instr; + Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, + soffset, fetch_offset, false, false, true) + .instr; mubuf->mubuf().vtx_binding = attrib_binding + 1; } else { - Instruction *mtbuf = bld.mtbuf( - opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_dfmt, nfmt, fetch_offset, false, true).instr; + Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index, + soffset, fetch_dfmt, nfmt, fetch_offset, false, true) + .instr; mtbuf->mtbuf().vtx_binding = attrib_binding + 1; } @@ -4867,24 +5106,25 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) channels[channel_start] = fetch_dst; } else { for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) - channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, - bitsize == 16 ? v2b : v1); + channels[channel_start + i] = + emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1); } channel_start += fetch_component; } if (!direct_fetch) { - bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && - nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; + bool is_float = + nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; - const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal; + const unsigned* swizzle = post_shuffle ? 
swizzle_post_shuffle : swizzle_normal; unsigned num_components = instr->dest.ssa.num_components; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; - std::array elems; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + std::array elems; unsigned num_temp = 0; for (unsigned i = 0; i < num_components; i++) { unsigned idx = i + component; @@ -4913,7 +5153,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) - isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset"); + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); @@ -4933,17 +5174,20 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) case 2: vertex_id = 1; /* P20 */ break; - default: - unreachable("invalid vertex index"); + default: unreachable("invalid vertex index"); } } if (dst.size() == 1) { - bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component); + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), + bld.m0(prim_mask), idx, component); } else { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; for (unsigned i = 0; i < dst.size(); i++) - vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i); + vec->operands[i] = + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), + bld.m0(prim_mask), idx, component + i); vec->definitions[0] = Definition(dst); bld.insert(std::move(vec)); } @@ -4952,7 +5196,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) { assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); @@ -4965,18 +5210,17 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins unreachable("LDS-based TCS input should have been lowered in NIR."); } -void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) { switch (ctx->shader->info.stage) { - case MESA_SHADER_TESS_CTRL: - visit_load_tcs_per_vertex_input(ctx, instr); - break; - default: - unreachable("Unimplemented shader stage"); + case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break; + default: unreachable("Unimplemented shader stage"); } } -void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr) { assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); @@ -4997,20 +5241,21 @@ void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, tess_coord, 3); } -Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) +Temp +load_desc_ptr(isel_context* ctx, unsigned desc_set) { if (ctx->program->info->need_indirect_descriptor_sets) { Builder bld(ctx->program, ctx->block); Temp ptr64 = 
convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2)); - return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false); + return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false); } return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); } - -void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5020,13 +5265,14 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) unsigned binding = nir_intrinsic_binding(instr); Temp desc_ptr; - radv_pipeline_layout *pipeline_layout = ctx->options->layout; - radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; + radv_pipeline_layout* pipeline_layout = ctx->options->layout; + radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout; unsigned offset = layout->binding[binding].offset; unsigned stride; if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; + unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + + layout->binding[binding].dynamic_offset_offset; desc_ptr = get_arg(ctx, ctx->args->ac.push_constants); offset = pipeline_layout->push_constant_size + 16 * idx; stride = 16; @@ -5036,7 +5282,8 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) } if (nir_src_is_const(instr->src[0])) { - index = bld.copy(bld.def(s1), Operand((uint32_t)(offset + nir_src_as_uint(instr->src[0]) * stride))); + index = bld.copy(bld.def(s1), + Operand((uint32_t)(offset + nir_src_as_uint(instr->src[0]) * stride))); } else if (index.type() == RegType::vgpr) { if (stride != 1) { bool index24bit = layout->binding[binding].array_size <= 0x1000000; @@ -5048,25 +5295,27 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) if (stride != 1) index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index); if (offset) - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = + bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - std::array elems; + std::array elems; elems[0] = desc_ptr; elems[1] = index; ctx->allocated_vec.emplace(dst.id(), elems); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, - Operand(0u)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand(0u)); } -void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, - Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info()) +void +load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst, + Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false, + bool allow_smem = true, memory_sync_info sync = memory_sync_info()) { Builder bld(ctx->program, ctx->block); - bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && 
allow_smem; + bool use_smem = + dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem; if (use_smem) offset = bld.as_uniform(offset); else { @@ -5088,7 +5337,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_ emit_load(ctx, bld, info, mubuf_load_params); } -Temp load_buffer_rsrc(isel_context *ctx, Temp rsrc) +Temp +load_buffer_rsrc(isel_context* ctx, Temp rsrc) { Builder bld(ctx->program, ctx->block); Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)); @@ -5097,17 +5347,19 @@ Temp load_buffer_rsrc(isel_context *ctx, Temp rsrc) return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding); } -bool is_inline_ubo(isel_context *ctx, nir_src rsrc) +bool +is_inline_ubo(isel_context* ctx, nir_src rsrc) { nir_binding binding = nir_chase_binding(rsrc); if (!binding.success) return false; - radv_descriptor_set_layout *layout = ctx->options->layout->set[binding.desc_set].layout; + radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout; return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT; } -void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5116,17 +5368,16 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) if (is_inline_ubo(ctx, instr->src[0])) { Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1))); - Temp binding_off = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); + Temp binding_off = + bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off); - uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + uint32_t desc_type = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (ctx->options->chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); @@ -5143,7 +5394,7 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) } void -visit_load_sbt_amd(isel_context *ctx, nir_intrinsic_instr *instr) +visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); @@ -5165,20 +5416,22 @@ visit_load_sbt_amd(isel_context *ctx, nir_intrinsic_instr *instr) false, true); } -void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); unsigned offset = nir_intrinsic_base(instr); unsigned count = instr->dest.ssa.num_components; - nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]); + nir_const_value* 
index_cv = nir_src_as_const_value(instr->src[0]); if (index_cv && instr->dest.ssa.bit_size == 32) { unsigned start = (offset + index_cv->u32) / 4u; start -= ctx->args->ac.base_inline_push_consts; if (start + count <= ctx->args->ac.num_inline_push_consts) { - std::array elems; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + std::array elems; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (unsigned i = 0; i < count; ++i) { elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); vec->operands[i] = Operand{elems[i]}; @@ -5192,7 +5445,8 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well - index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + index); Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; @@ -5212,28 +5466,19 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op; switch (vec.size()) { - case 1: - op = aco_opcode::s_load_dword; - break; - case 2: - op = aco_opcode::s_load_dwordx2; - break; + case 1: op = aco_opcode::s_load_dword; break; + case 2: op = aco_opcode::s_load_dwordx2; break; case 3: vec = bld.tmp(s4); trim = true; FALLTHROUGH; - case 4: - op = aco_opcode::s_load_dwordx4; - break; + case 4: op = aco_opcode::s_load_dwordx4; break; case 6: vec = bld.tmp(s8); trim = true; FALLTHROUGH; - case 8: - op = aco_opcode::s_load_dwordx8; - break; - default: - unreachable("unimplemented or forbidden load_push_constant."); + case 8: op = aco_opcode::s_load_dwordx8; break; + default: unreachable("unimplemented or forbidden load_push_constant."); } bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true; @@ -5247,29 +5492,25 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) if (trim) { emit_split_vector(ctx, vec, 4); RegClass rc = dst.size() == 3 ? 
s1 : s2; - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, vec, 0, rc), - emit_extract_vector(ctx, vec, 1, rc), - emit_extract_vector(ctx, vec, 2, rc)); - + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc), + emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc)); } emit_split_vector(ctx, dst, instr->dest.ssa.num_components); } -void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Builder bld(ctx->program, ctx->block); - uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + uint32_t desc_type = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (ctx->options->chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); @@ -5280,20 +5521,23 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); if (base && offset.type() == RegType::sgpr) - offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, + Operand(base)); else if (base && offset.type() == RegType::vgpr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); - Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), - Operand(MIN2(base + range, ctx->shader->constant_data_size)), - Operand(desc_type)); + Temp rsrc = + bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), + Operand(ctx->constant_data_offset)), + Operand(MIN2(base + range, ctx->shader->constant_data_size)), Operand(desc_type)); unsigned size = instr->dest.ssa.bit_size / 8; // TODO: get alignment information for subdword constants load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0); } -void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr) { if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) ctx->cf_info.exec_potentially_empty_discard = true; @@ -5310,22 +5554,23 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) +void +visit_discard(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) ctx->cf_info.exec_potentially_empty_discard = true; - bool divergent = ctx->cf_info.parent_if.is_divergent || - ctx->cf_info.parent_loop.has_divergent_continue; + bool divergent = + ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue; if 
(ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) { /* we handle discards the same way as jump instructions */ append_logical_end(ctx->block); /* in loops, discard behaves like break */ - Block *linear_target = ctx->cf_info.parent_loop.exit; + Block* linear_target = ctx->cf_info.parent_loop.exit; ctx->block->kind |= block_kind_discard; /* uniform discard - loop ends here */ @@ -5342,7 +5587,8 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) ctx->program->needs_exact = true; /* save exec somewhere temporarily so that it doesn't get * overwritten before the discard from outer exec masks */ - Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm)); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), + Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, cond); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -5382,25 +5628,23 @@ enum aco_descriptor_type { }; static bool -should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) { +should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array) +{ if (sampler_dim == GLSL_SAMPLER_DIM_BUF) return false; ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array); - return dim == ac_image_cube || - dim == ac_image_1darray || - dim == ac_image_2darray || + return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray || dim == ac_image_2darraymsaa; } -Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, - enum aco_descriptor_type desc_type, - const nir_tex_instr *tex_instr, bool write) +Temp +get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr, + enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write) { -/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc - std::unordered_map::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index); - if (it != ctx->tex_desc.end()) - return it->second; -*/ + /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc + std::unordered_map::iterator it = ctx->tex_desc.find((uint64_t) desc_type << + 32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second; + */ Temp index = Temp(); bool index_set = false; unsigned constant_index = 0; @@ -5413,13 +5657,13 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, descriptor_set = 0; base_index = tex_instr->sampler_index; } else { - while(deref_instr->deref_type != nir_deref_type_var) { + while (deref_instr->deref_type != nir_deref_type_var) { unsigned array_size = glsl_get_aoa_size(deref_instr->type); if (!array_size) array_size = 1; assert(deref_instr->deref_type == nir_deref_type_array); - nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index); + nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index); if (const_value) { constant_index += array_size * const_value->u32; } else { @@ -5428,13 +5672,15 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, indirect = bld.as_uniform(indirect); if (array_size != 1) - indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); + indirect = + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); if (!index_set) { index = indirect; index_set = true; } else { - index = 
bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); + index = + bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); } } @@ -5447,8 +5693,8 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, Temp list = load_desc_ptr(ctx, descriptor_set); list = convert_pointer_to_64_bit(ctx, list); - struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; - struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; + struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; aco_opcode opcode; @@ -5487,20 +5733,18 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, opcode = aco_opcode::s_load_dwordx4; offset += 64; break; - default: - unreachable("invalid desc_type\n"); + default: unreachable("invalid desc_type\n"); } offset += constant_index * stride; if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset && - (!index_set || binding->immutable_samplers_equal)) { + (!index_set || binding->immutable_samplers_equal)) { if (binding->immutable_samplers_equal) constant_index = 0; - const uint32_t *samplers = radv_immutable_samplers(layout, binding); - uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? - C_008F30_TRUNC_COORD : 0xffffffffu; + const uint32_t* samplers = radv_immutable_samplers(layout, binding); + uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu; return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(samplers[constant_index * 4 + 0] & dword0_mask), Operand(samplers[constant_index * 4 + 1]), @@ -5512,8 +5756,9 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, if (!index_set) { off = bld.copy(bld.def(s1), Operand(offset)); } else { - off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), - bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); + off = Operand( + (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); } Temp res = bld.smem(opcode, bld.def(type), list, off); @@ -5522,103 +5767,80 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, Temp components[8]; for (unsigned i = 0; i < 8; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), - Definition(components[1]), - Definition(components[2]), - Definition(components[3]), - res); + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), + Definition(components[2]), Definition(components[3]), res); Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write); - bld.pseudo(aco_opcode::p_split_vector, - bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), - Definition(components[4]), - Definition(components[5]), - Definition(components[6]), - Definition(components[7]), - desc2); + bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), + Definition(components[4]), Definition(components[5]), Definition(components[6]), + Definition(components[7]), desc2); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - components[0], components[1], components[2], components[3], 
- components[4], components[5], components[6], components[7]); - } else if (desc_type == ACO_DESC_IMAGE && - ctx->options->has_image_load_dcc_bug && - !tex_instr && !write) { + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], + components[2], components[3], components[4], components[5], components[6], + components[7]); + } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr && + !write) { Temp components[8]; for (unsigned i = 0; i < 8; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), Definition(components[1]), - Definition(components[2]), Definition(components[3]), - Definition(components[4]), Definition(components[5]), - Definition(components[6]), Definition(components[7]), res); + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), + Definition(components[2]), Definition(components[3]), Definition(components[4]), + Definition(components[5]), Definition(components[6]), Definition(components[7]), + res); /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a * hardware bug. */ - components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - components[6], - bld.copy(bld.def(s1), Operand((uint32_t)C_00A018_WRITE_COMPRESS_ENABLE))); + components[6] = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6], + bld.copy(bld.def(s1), Operand((uint32_t)C_00A018_WRITE_COMPRESS_ENABLE))); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - components[0], components[1], components[2], components[3], - components[4], components[5], components[6], components[7]); + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], + components[2], components[3], components[4], components[5], components[6], + components[7]); } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) { Temp components[4]; for (unsigned i = 0; i < 4; i++) components[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(components[0]), Definition(components[1]), + bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), Definition(components[2]), Definition(components[3]), res); /* We want to always use the linear filtering truncation behaviour for * nir_texop_tg4, even if the sampler uses nearest/point filtering. */ - components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - components[0], Operand((uint32_t)C_008F30_TRUNC_COORD)); + components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0], + Operand((uint32_t)C_008F30_TRUNC_COORD)); - res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - components[0], components[1], components[2], components[3]); + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1], + components[2], components[3]); } return res; } -static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +static int +image_type_to_components_count(enum glsl_sampler_dim dim, bool array) { switch (dim) { - case GLSL_SAMPLER_DIM_BUF: - return 1; - case GLSL_SAMPLER_DIM_1D: - return array ? 2 : 1; - case GLSL_SAMPLER_DIM_2D: - return array ? 3 : 2; - case GLSL_SAMPLER_DIM_MS: - return array ? 4 : 3; + case GLSL_SAMPLER_DIM_BUF: return 1; + case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: return array ? 
3 : 2; + case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3; case GLSL_SAMPLER_DIM_3D: - case GLSL_SAMPLER_DIM_CUBE: - return 3; + case GLSL_SAMPLER_DIM_CUBE: return 3; case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_SUBPASS: - return 2; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return 3; - default: - break; + case GLSL_SAMPLER_DIM_SUBPASS: return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3; + default: break; } return 0; } - -static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, - Definition dst, - Temp rsrc, - Operand samp, - std::vector coords, - unsigned wqm_mask=0, - Operand vdata=Operand(v1)) +static MIMG_instruction* +emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, + std::vector coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1)) { /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */ unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5; @@ -5629,7 +5851,8 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, if (coords.size() > 1) { coord = bld.tmp(RegType::vgpr, coords.size()); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; for (unsigned i = 0; i < coords.size(); i++) vec->operands[i] = Operand(coords[i]); vec->definitions[0] = Definition(coord); @@ -5659,8 +5882,8 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, } } - aco_ptr mimg{create_instruction( - op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; + aco_ptr mimg{ + create_instruction(op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; if (dst.isTemp()) mimg->definitions[0] = dst; mimg->operands[0] = Operand(rsrc); @@ -5669,12 +5892,13 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op, for (unsigned i = 0; i < coords.size(); i++) mimg->operands[3 + i] = Operand(coords[i]); - MIMG_instruction *res = mimg.get(); + MIMG_instruction* res = mimg.get(); bld.insert(std::move(mimg)); return res; } -void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -5699,8 +5923,8 @@ void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1)); args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1)); - MIMG_instruction *mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, - Definition(dst), resource, Operand(s4), args); + MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst), + resource, Operand(s4), args); mimg->dim = ac_image_1d; mimg->dmask = 0xf; mimg->unrm = true; @@ -5721,17 +5945,18 @@ void visit_bvh64_intersect_ray_amd(isel_context *ctx, nir_intrinsic_instr *instr * The sample index should be adjusted as follows: * sample_index = (fmask >> (sample_index * 4)) & 0xF; */ -static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector& coords, Operand sample_index, Temp fmask_desc_ptr) +static Temp +adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector& coords, + Operand sample_index, Temp fmask_desc_ptr) { Builder bld(ctx->program, ctx->block); Temp fmask = bld.tmp(v1); unsigned dim = ctx->options->chip_class >= GFX10 - ? 
ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) - : 0; + ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) + : 0; - MIMG_instruction *load = emit_mimg(bld, aco_opcode::image_load, - Definition(fmask), fmask_desc_ptr, - Operand(s4), coords); + MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask), + fmask_desc_ptr, Operand(s4), coords); load->glc = false; load->dlc = false; load->dmask = 0x1; @@ -5747,7 +5972,8 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec sample_index4 = Operand(0u); } } else if (sample_index.regClass() == s1) { - sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); + sample_index4 = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); } else { assert(sample_index.regClass() == v1); sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index); @@ -5759,14 +5985,17 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec else if (sample_index4.isConstant() && sample_index4.constantValue() == 28) final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask); else - final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); + final_sample = + bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK * resource descriptor is 0 (invalid), */ Temp compare = bld.tmp(bld.lm); - bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), - Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); + bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand(0u), + emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)) + .def(0) + .setHint(vcc); Temp sample_index_v = bld.copy(bld.def(v1), sample_index); @@ -5774,13 +6003,15 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare); } -static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type) +static std::vector +get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr, const struct glsl_type* type) { Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); - ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + ASSERTED bool add_frag_pos = + (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); assert(!add_frag_pos && "Input attachments should be lowered."); bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; @@ -5794,14 +6025,18 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic /* get sample index */ if (instr->intrinsic == nir_intrinsic_image_deref_load || instr->intrinsic == nir_intrinsic_image_deref_sparse_load) { - nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); - Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); + nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]); + Operand sample_index = + sample_cv ? 
Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); std::vector fmask_load_address; for (unsigned i = 0; i < (is_array ? 3 : 2); i++) fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1)); - Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false); - coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr); + Temp fmask_desc_ptr = + get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_FMASK, nullptr, false); + coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, + sample_index, fmask_desc_ptr); } else { coords[count] = emit_extract_vector(ctx, src2, 0, v1); } @@ -5822,7 +6057,8 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic instr->intrinsic == nir_intrinsic_image_deref_sparse_load || instr->intrinsic == nir_intrinsic_image_deref_store) { int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3; - bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; + bool level_zero = + nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; if (!level_zero) coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); @@ -5831,8 +6067,8 @@ static std::vector get_image_coords(isel_context *ctx, const nir_intrinsic return coords; } - -memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics) +memory_sync_info +get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics) { /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ if (semantics & semantic_atomicrmw) @@ -5848,7 +6084,8 @@ memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class return memory_sync_info(storage, semantics); } -Operand emit_tfe_init(Builder& bld, Temp dst) +Operand +emit_tfe_init(Builder& bld, Temp dst) { Temp tmp = bld.tmp(dst.regClass()); @@ -5867,11 +6104,13 @@ Operand emit_tfe_init(Builder& bld, Temp dst) return Operand(tmp); } -void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load; @@ -5881,8 +6120,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) unsigned access = var->data.access | nir_intrinsic_access(instr); unsigned result_size = instr->dest.ssa.num_components - is_sparse; - unsigned expand_mask = nir_ssa_def_components_read(&instr->dest.ssa) & - u_bit_consecutive(0, result_size); + unsigned expand_mask = + nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size); expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */ if (dim == GLSL_SAMPLER_DIM_BUF) expand_mask = (1u << 
util_last_bit(expand_mask)) - 1u; @@ -5911,25 +6150,17 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode; switch (util_bitcount(dmask)) { - case 1: - opcode = aco_opcode::buffer_load_format_x; - break; - case 2: - opcode = aco_opcode::buffer_load_format_xy; - break; - case 3: - opcode = aco_opcode::buffer_load_format_xyz; - break; - case 4: - opcode = aco_opcode::buffer_load_format_xyzw; - break; - default: - unreachable(">4 channel buffer image load"); + case 1: opcode = aco_opcode::buffer_load_format_x; break; + case 2: opcode = aco_opcode::buffer_load_format_xy; break; + case 3: opcode = aco_opcode::buffer_load_format_xyz; break; + case 4: opcode = aco_opcode::buffer_load_format_xyzw; break; + default: unreachable(">4 channel buffer image load"); } - aco_ptr load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)}; + aco_ptr load{ + create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)}; load->operands[0] = Operand(resource); load->operands[1] = Operand(vindex); - load->operands[2] = Operand((uint32_t) 0); + load->operands[2] = Operand((uint32_t)0); load->definitions[0] = Definition(tmp); load->idxen = true; load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); @@ -5946,8 +6177,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); - MIMG_instruction *load = emit_mimg(bld, opcode, Definition(tmp), resource, - Operand(s4), coords, 0, vdata); + MIMG_instruction* load = + emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata); load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; load->dlc = load->glc && ctx->options->chip_class >= GFX10; load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -5962,16 +6193,19 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) /* The result components are 64-bit but the sparse residency code is * 32-bit. So add a zero to the end so expand_vector() works correctly. */ - tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size()+1), tmp, Operand(0u)); + tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp, + Operand(0u)); } expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask); } -void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) { - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Temp data = get_ssa_temp(ctx, instr->src[3].ssa); @@ -5983,32 +6217,28 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); unsigned access = var->data.access | nir_intrinsic_access(instr); - bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 
1 : 0; + bool glc = ctx->options->chip_class == GFX6 || + access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) + ? 1 + : 0; if (dim == GLSL_SAMPLER_DIM_BUF) { - Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true); + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, nullptr, true); Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); aco_opcode opcode; switch (data.size()) { - case 1: - opcode = aco_opcode::buffer_store_format_x; - break; - case 2: - opcode = aco_opcode::buffer_store_format_xy; - break; - case 3: - opcode = aco_opcode::buffer_store_format_xyz; - break; - case 4: - opcode = aco_opcode::buffer_store_format_xyzw; - break; - default: - unreachable(">4 channel buffer image store"); + case 1: opcode = aco_opcode::buffer_store_format_x; break; + case 2: opcode = aco_opcode::buffer_store_format_xy; break; + case 3: opcode = aco_opcode::buffer_store_format_xyz; break; + case 4: opcode = aco_opcode::buffer_store_format_xyzw; break; + default: unreachable(">4 channel buffer image store"); } - aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(opcode, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(rsrc); store->operands[1] = Operand(vindex); - store->operands[2] = Operand((uint32_t) 0); + store->operands[2] = Operand((uint32_t)0); store->operands[3] = Operand(data); store->idxen = true; store->glc = glc; @@ -6022,14 +6252,15 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) assert(data.type() == RegType::vgpr); std::vector coords = get_image_coords(ctx, instr, type); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, nullptr, true); bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; aco_opcode opcode = level_zero ? 
aco_opcode::image_store : aco_opcode::image_store_mip; Builder bld(ctx->program, ctx->block); - MIMG_instruction *store = emit_mimg(bld, opcode, Definition(), resource, - Operand(s4), coords, 0, Operand(data)); + MIMG_instruction* store = + emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data)); store->glc = glc; store->dlc = false; store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -6042,11 +6273,13 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Builder bld(ctx->program, ctx->block); @@ -6056,62 +6289,64 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented."); if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) - data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), get_ssa_temp(ctx, instr->src[4].ssa), data); + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), + get_ssa_temp(ctx, instr->src[4].ssa), data); aco_opcode buf_op, buf_op64, image_op; switch (instr->intrinsic) { - case nir_intrinsic_image_deref_atomic_add: - buf_op = aco_opcode::buffer_atomic_add; - buf_op64 = aco_opcode::buffer_atomic_add_x2; - image_op = aco_opcode::image_atomic_add; - break; - case nir_intrinsic_image_deref_atomic_umin: - buf_op = aco_opcode::buffer_atomic_umin; - buf_op64 = aco_opcode::buffer_atomic_umin_x2; - image_op = aco_opcode::image_atomic_umin; - break; - case nir_intrinsic_image_deref_atomic_imin: - buf_op = aco_opcode::buffer_atomic_smin; - buf_op64 = aco_opcode::buffer_atomic_smin_x2; - image_op = aco_opcode::image_atomic_smin; - break; - case nir_intrinsic_image_deref_atomic_umax: - buf_op = aco_opcode::buffer_atomic_umax; - buf_op64 = aco_opcode::buffer_atomic_umax_x2; - image_op = aco_opcode::image_atomic_umax; - break; - case nir_intrinsic_image_deref_atomic_imax: - buf_op = aco_opcode::buffer_atomic_smax; - buf_op64 = aco_opcode::buffer_atomic_smax_x2; - image_op = aco_opcode::image_atomic_smax; - break; - case nir_intrinsic_image_deref_atomic_and: - buf_op = aco_opcode::buffer_atomic_and; - buf_op64 = aco_opcode::buffer_atomic_and_x2; - image_op = aco_opcode::image_atomic_and; - break; - case nir_intrinsic_image_deref_atomic_or: - buf_op = aco_opcode::buffer_atomic_or; - buf_op64 = aco_opcode::buffer_atomic_or_x2; - image_op = aco_opcode::image_atomic_or; - break; - case nir_intrinsic_image_deref_atomic_xor: - buf_op = aco_opcode::buffer_atomic_xor; - buf_op64 = aco_opcode::buffer_atomic_xor_x2; - image_op = aco_opcode::image_atomic_xor; - break; - case nir_intrinsic_image_deref_atomic_exchange: - buf_op = aco_opcode::buffer_atomic_swap; - buf_op64 = aco_opcode::buffer_atomic_swap_x2; - image_op = aco_opcode::image_atomic_swap; - break; - case 
nir_intrinsic_image_deref_atomic_comp_swap: - buf_op = aco_opcode::buffer_atomic_cmpswap; - buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; - image_op = aco_opcode::image_atomic_cmpswap; - break; - default: - unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions."); + case nir_intrinsic_image_deref_atomic_add: + buf_op = aco_opcode::buffer_atomic_add; + buf_op64 = aco_opcode::buffer_atomic_add_x2; + image_op = aco_opcode::image_atomic_add; + break; + case nir_intrinsic_image_deref_atomic_umin: + buf_op = aco_opcode::buffer_atomic_umin; + buf_op64 = aco_opcode::buffer_atomic_umin_x2; + image_op = aco_opcode::image_atomic_umin; + break; + case nir_intrinsic_image_deref_atomic_imin: + buf_op = aco_opcode::buffer_atomic_smin; + buf_op64 = aco_opcode::buffer_atomic_smin_x2; + image_op = aco_opcode::image_atomic_smin; + break; + case nir_intrinsic_image_deref_atomic_umax: + buf_op = aco_opcode::buffer_atomic_umax; + buf_op64 = aco_opcode::buffer_atomic_umax_x2; + image_op = aco_opcode::image_atomic_umax; + break; + case nir_intrinsic_image_deref_atomic_imax: + buf_op = aco_opcode::buffer_atomic_smax; + buf_op64 = aco_opcode::buffer_atomic_smax_x2; + image_op = aco_opcode::image_atomic_smax; + break; + case nir_intrinsic_image_deref_atomic_and: + buf_op = aco_opcode::buffer_atomic_and; + buf_op64 = aco_opcode::buffer_atomic_and_x2; + image_op = aco_opcode::image_atomic_and; + break; + case nir_intrinsic_image_deref_atomic_or: + buf_op = aco_opcode::buffer_atomic_or; + buf_op64 = aco_opcode::buffer_atomic_or_x2; + image_op = aco_opcode::image_atomic_or; + break; + case nir_intrinsic_image_deref_atomic_xor: + buf_op = aco_opcode::buffer_atomic_xor; + buf_op64 = aco_opcode::buffer_atomic_xor_x2; + image_op = aco_opcode::image_atomic_xor; + break; + case nir_intrinsic_image_deref_atomic_exchange: + buf_op = aco_opcode::buffer_atomic_swap; + buf_op64 = aco_opcode::buffer_atomic_swap_x2; + image_op = aco_opcode::image_atomic_swap; + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + buf_op = aco_opcode::buffer_atomic_cmpswap; + buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; + image_op = aco_opcode::image_atomic_cmpswap; + break; + default: + unreachable("visit_image_atomic should only be called with " + "nir_intrinsic_image_deref_atomic_* instructions."); } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6119,8 +6354,10 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) if (dim == GLSL_SAMPLER_DIM_BUF) { Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true); - //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, nullptr, true); + // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet + // implemented."); aco_ptr mubuf{create_instruction( is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 
1 : 0)}; mubuf->operands[0] = Operand(resource); @@ -6141,10 +6378,11 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } std::vector coords = get_image_coords(ctx, instr, type); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, nullptr, true); Definition def = return_previous ? Definition(dst) : Definition(); - MIMG_instruction *mimg = emit_mimg(bld, image_op, def, resource, - Operand(s4), coords, 0, Operand(data)); + MIMG_instruction* mimg = + emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data)); mimg->glc = return_previous; mimg->dlc = false; /* Not needed for atomics */ mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -6157,7 +6395,8 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) return; } -void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) +void +get_buffer_size(isel_context* ctx, Temp desc, Temp dst) { if (ctx->options->chip_class == GFX8) { /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ @@ -6165,18 +6404,21 @@ void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) Temp size = emit_extract_vector(ctx, desc, 2, s1); - Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); - size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), bld.as_uniform(size_div3), Operand(1u)); + Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), + bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); + size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + bld.as_uniform(size_div3), Operand(1u)); Temp stride = emit_extract_vector(ctx, desc, 1, s1); - stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u)); + stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, + Operand((5u << 16) | 16u)); Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u)); size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); Temp shr_dst = dst.type() == RegType::vgpr ? 
bld.tmp(s1) : dst; - bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), - size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); + bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size, + bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); if (dst.type() == RegType::vgpr) bld.copy(Definition(dst), shr_dst); @@ -6186,16 +6428,19 @@ void get_buffer_size(isel_context *ctx, Temp desc, Temp dst) } } -void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr) { - const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - const struct glsl_type *type = glsl_without_array(var->type); + const nir_variable* var = + nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type* type = glsl_without_array(var->type); const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); bool is_array = glsl_sampler_type_is_array(type); Builder bld(ctx->program, ctx->block); if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { - Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, false); + Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_BUFFER, NULL, false); return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa)); } @@ -6204,19 +6449,19 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) std::vector lod{bld.copy(bld.def(v1), Operand(0u))}; /* Resource */ - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, false); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, NULL, false); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - MIMG_instruction *mimg = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(dst), resource, Operand(s4), lod); + MIMG_instruction* mimg = + emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod); uint8_t& dmask = mimg->dmask; mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; mimg->da = glsl_sampler_type_is_array(type); - if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && - glsl_sampler_type_is_array(type)) { + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && glsl_sampler_type_is_array(type)) { assert(instr->dest.ssa.num_components == 3); Temp tmp = ctx->program->allocateTmp(v3); @@ -6224,13 +6469,12 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, tmp, 3); /* divide 3rd value by 6 by multiplying with magic number */ - Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); - Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t)0x2AAAAAAB)); + Temp by_6 = + bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, tmp, 0, v1), - emit_extract_vector(ctx, tmp, 1, v1), - by_6); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, tmp, 0, v1), + emit_extract_vector(ctx, tmp, 1, v1), by_6); } else if (ctx->options->chip_class == GFX9 && 
glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D && @@ -6242,14 +6486,18 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, dst, instr->dest.ssa.num_components); } -void get_image_samples(isel_context *ctx, Definition dst, Temp resource) +void +get_image_samples(isel_context* ctx, Definition dst, Temp resource) { Builder bld(ctx->program, ctx->block); Temp dword3 = emit_extract_vector(ctx, resource, 3, s1); - Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); - Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); - Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); + Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, + Operand(16u | 4u << 16)); + Temp samples = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); + Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, + Operand(28u | 4u << 16 /* offset=28, width=4 */)); Operand default_sample = Operand(1u); if (ctx->options->robust_buffer_access) { @@ -6257,7 +6505,8 @@ void get_image_samples(isel_context *ctx, Definition dst, Temp resource) * all zero, then it's a null descriptor. */ Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); - Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); + Temp is_non_null_descriptor = + bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); default_sample = Operand(is_non_null_descriptor); } @@ -6265,15 +6514,18 @@ void get_image_samples(isel_context *ctx, Definition dst, Temp resource) bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa)); } -void visit_image_samples(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, false); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), + ACO_DESC_IMAGE, NULL, false); get_image_samples(ctx, Definition(dst), resource); } -void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned num_components = instr->num_components; @@ -6292,7 +6544,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) get_memory_sync_info(instr, storage_buffer, 0)); } -void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Temp data = get_ssa_temp(ctx, instr->src[0].ssa); @@ -6303,13 +6556,14 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + bool glc = + nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); unsigned write_count = 0; Temp write_datas[32]; unsigned 
offsets[32]; - split_buffer_store(ctx, instr, false, RegType::vgpr, - data, writemask, 16, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, + write_datas, offsets); /* GFX6-7 are affected by a hw bug that prevents address clamping to work * correctly when the SGPR offset is used. @@ -6320,10 +6574,11 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - aco_ptr store{create_instruction(op, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(op, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(rsrc); store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); store->operands[3] = Operand(write_datas[i]); store->offset = offsets[i]; store->offen = (offset.type() == RegType::vgpr); @@ -6336,7 +6591,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6353,54 +6609,56 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op32, op64; switch (instr->intrinsic) { - case nir_intrinsic_ssbo_atomic_add: - op32 = aco_opcode::buffer_atomic_add; - op64 = aco_opcode::buffer_atomic_add_x2; - break; - case nir_intrinsic_ssbo_atomic_imin: - op32 = aco_opcode::buffer_atomic_smin; - op64 = aco_opcode::buffer_atomic_smin_x2; - break; - case nir_intrinsic_ssbo_atomic_umin: - op32 = aco_opcode::buffer_atomic_umin; - op64 = aco_opcode::buffer_atomic_umin_x2; - break; - case nir_intrinsic_ssbo_atomic_imax: - op32 = aco_opcode::buffer_atomic_smax; - op64 = aco_opcode::buffer_atomic_smax_x2; - break; - case nir_intrinsic_ssbo_atomic_umax: - op32 = aco_opcode::buffer_atomic_umax; - op64 = aco_opcode::buffer_atomic_umax_x2; - break; - case nir_intrinsic_ssbo_atomic_and: - op32 = aco_opcode::buffer_atomic_and; - op64 = aco_opcode::buffer_atomic_and_x2; - break; - case nir_intrinsic_ssbo_atomic_or: - op32 = aco_opcode::buffer_atomic_or; - op64 = aco_opcode::buffer_atomic_or_x2; - break; - case nir_intrinsic_ssbo_atomic_xor: - op32 = aco_opcode::buffer_atomic_xor; - op64 = aco_opcode::buffer_atomic_xor_x2; - break; - case nir_intrinsic_ssbo_atomic_exchange: - op32 = aco_opcode::buffer_atomic_swap; - op64 = aco_opcode::buffer_atomic_swap_x2; - break; - case nir_intrinsic_ssbo_atomic_comp_swap: - op32 = aco_opcode::buffer_atomic_cmpswap; - op64 = aco_opcode::buffer_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); + case nir_intrinsic_ssbo_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_ssbo_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_ssbo_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_ssbo_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = 
aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_ssbo_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_ssbo_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_ssbo_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_ssbo_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable( + "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t)0); mubuf->operands[3] = Operand(data); if (return_previous) mubuf->definitions[0] = Definition(dst); @@ -6414,7 +6672,9 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) ctx->block->instructions.emplace_back(std::move(mubuf)); } -void visit_get_ssbo_size(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr) +{ Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6436,15 +6696,15 @@ void visit_get_ssbo_size(isel_context *ctx, nir_intrinsic_instr *instr) { } } -void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned num_components = instr->num_components; unsigned component_size = instr->dest.ssa.bit_size / 8; LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)), - get_ssa_temp(ctx, &instr->dest.ssa), - num_components, component_size}; + get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size}; info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); @@ -6452,7 +6712,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) /* VMEM stores don't update the SMEM cache and it's difficult to prove that * it's safe to use SMEM */ bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; - if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) { + if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || + !can_use_smem) { emit_load(ctx, bld, info, global_load_params); } else { info.offset = Operand(bld.as_uniform(info.offset)); @@ -6460,7 +6721,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_store_global(isel_context *ctx, nir_intrinsic_instr 
*instr) +void +visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; @@ -6469,7 +6731,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + bool glc = + nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); if (ctx->options->chip_class >= GFX7) addr = as_vgpr(ctx, addr); @@ -6477,8 +6740,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; - split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - 16, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, + write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { if (ctx->options->chip_class >= GFX7) { @@ -6490,11 +6753,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp carry = bld.tmp(bld.lm); bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); - bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), - Operand(offset), addr0); - bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), - Operand(0u), addr1, - carry).def(1).setHint(vcc); + bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), + bld.hint_vcc(Definition(carry)), Operand(offset), addr0); + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), Operand(0u), + addr1, carry) + .def(1) + .setHint(vcc); store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); @@ -6504,15 +6768,9 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) bool global = ctx->options->chip_class >= GFX9; aco_opcode op; switch (write_datas[i].bytes()) { - case 1: - op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; - break; - case 2: - op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; - break; - case 4: - op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; - break; + case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break; + case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break; + case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break; case 8: op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; break; @@ -6522,11 +6780,11 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) case 16: op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; break; - default: - unreachable("store_global not implemented for this size."); + default: unreachable("store_global not implemented for this size."); } - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; + aco_ptr flat{ + create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 3, 0)}; flat->operands[0] = Operand(store_addr); flat->operands[1] = Operand(s1); flat->operands[2] = Operand(write_datas[i]); @@ -6544,7 +6802,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = get_gfx6_global_rsrc(bld, addr); - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -6561,7 +6820,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6582,52 +6842,54 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->options->chip_class >= GFX7) { bool global = ctx->options->chip_class >= GFX9; switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; - op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; - op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; - op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; - op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; - op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; - op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; - op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; - op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; - op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; - op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + case nir_intrinsic_global_atomic_add: + op32 = global ? 
aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; + op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; + op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; + op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; + op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; + op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; + op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; + op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; + op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; + op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; + op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " + "instructions."); } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; + aco_ptr flat{create_instruction( + op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 
1 : 0)}; flat->operands[0] = Operand(addr); flat->operands[1] = Operand(s1); flat->operands[2] = Operand(data); @@ -6644,55 +6906,57 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) assert(ctx->options->chip_class == GFX6); switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = aco_opcode::buffer_atomic_add; - op64 = aco_opcode::buffer_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = aco_opcode::buffer_atomic_smin; - op64 = aco_opcode::buffer_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = aco_opcode::buffer_atomic_umin; - op64 = aco_opcode::buffer_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = aco_opcode::buffer_atomic_smax; - op64 = aco_opcode::buffer_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = aco_opcode::buffer_atomic_umax; - op64 = aco_opcode::buffer_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = aco_opcode::buffer_atomic_and; - op64 = aco_opcode::buffer_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = aco_opcode::buffer_atomic_or; - op64 = aco_opcode::buffer_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = aco_opcode::buffer_atomic_xor; - op64 = aco_opcode::buffer_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = aco_opcode::buffer_atomic_swap; - op64 = aco_opcode::buffer_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = aco_opcode::buffer_atomic_cmpswap; - op64 = aco_opcode::buffer_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + case nir_intrinsic_global_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " + "instructions."); } Temp rsrc = get_gfx6_global_rsrc(bld, addr); aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 4, return_previous ? 
1 : 0)}; mubuf->operands[0] = Operand(rsrc); mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); @@ -6710,7 +6974,8 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } } -void visit_load_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) +void +visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) { Builder bld(ctx->program, ctx->block); @@ -6728,11 +6993,12 @@ void visit_load_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) unsigned num_components = intrin->dest.ssa.num_components; unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0; - load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, - elem_size_bytes, num_components, swizzle_element_size, !swizzled, reorder, slc); + load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, + num_components, swizzle_element_size, !swizzled, reorder, slc); } -void visit_store_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) +void +visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) { Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa); Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa); @@ -6749,31 +7015,28 @@ void visit_store_buffer(isel_context *ctx, nir_intrinsic_instr *intrin) nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin); memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none); - store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, - elem_size_bytes, write_mask, !swizzled, sync, slc); + store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, + write_mask, !swizzled, sync, slc); } -sync_scope translate_nir_scope(nir_scope scope) +sync_scope +translate_nir_scope(nir_scope scope) { switch (scope) { case NIR_SCOPE_NONE: - case NIR_SCOPE_INVOCATION: - return scope_invocation; - case NIR_SCOPE_SUBGROUP: - return scope_subgroup; - case NIR_SCOPE_WORKGROUP: - return scope_workgroup; - case NIR_SCOPE_QUEUE_FAMILY: - return scope_queuefamily; - case NIR_SCOPE_DEVICE: - return scope_device; - case NIR_SCOPE_SHADER_CALL: - unreachable("unsupported scope"); + case NIR_SCOPE_INVOCATION: return scope_invocation; + case NIR_SCOPE_SUBGROUP: return scope_subgroup; + case NIR_SCOPE_WORKGROUP: return scope_workgroup; + case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily; + case NIR_SCOPE_DEVICE: return scope_device; + case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope"); } unreachable("invalid scope"); } -void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { +void +emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); unsigned semantics = 0; @@ -6787,11 +7050,10 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory * - additionally, when NGG is used on GFX10+, shared memory is used for certain features */ - bool shared_storage_used = - ctx->stage.hw == HWStage::CS || - ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS || - (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) || - ctx->stage.hw == HWStage::NGG; + bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS || + ctx->stage.hw == HWStage::HS || + (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) || + 
ctx->stage.hw == HWStage::NGG; /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half. * They are allowed in CS, TCS, and in any NGG shader. @@ -6801,7 +7063,7 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { unsigned nir_storage = nir_intrinsic_memory_modes(instr); if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global)) - storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image + storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image if (shared_storage_used && (nir_storage & nir_var_mem_shared)) storage |= storage_shared; @@ -6819,7 +7081,8 @@ void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { exec_scope); } -void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr) { // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -6832,7 +7095,8 @@ void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align); } -void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned writemask = nir_intrinsic_write_mask(instr); Temp data = get_ssa_temp(ctx, instr->src[0].ssa); @@ -6843,7 +7107,8 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align); } -void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr) { unsigned offset = nir_intrinsic_base(instr); Builder bld(ctx->program, ctx->block); @@ -6853,76 +7118,75 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) unsigned num_operands = 3; aco_opcode op32, op64, op32_rtn, op64_rtn; - switch(instr->intrinsic) { - case nir_intrinsic_shared_atomic_add: - op32 = aco_opcode::ds_add_u32; - op64 = aco_opcode::ds_add_u64; - op32_rtn = aco_opcode::ds_add_rtn_u32; - op64_rtn = aco_opcode::ds_add_rtn_u64; - break; - case nir_intrinsic_shared_atomic_imin: - op32 = aco_opcode::ds_min_i32; - op64 = aco_opcode::ds_min_i64; - op32_rtn = aco_opcode::ds_min_rtn_i32; - op64_rtn = aco_opcode::ds_min_rtn_i64; - break; - case nir_intrinsic_shared_atomic_umin: - op32 = aco_opcode::ds_min_u32; - op64 = aco_opcode::ds_min_u64; - op32_rtn = aco_opcode::ds_min_rtn_u32; - op64_rtn = aco_opcode::ds_min_rtn_u64; - break; - case nir_intrinsic_shared_atomic_imax: - op32 = aco_opcode::ds_max_i32; - op64 = aco_opcode::ds_max_i64; - op32_rtn = aco_opcode::ds_max_rtn_i32; - op64_rtn = aco_opcode::ds_max_rtn_i64; - break; - case nir_intrinsic_shared_atomic_umax: - op32 = aco_opcode::ds_max_u32; - op64 = aco_opcode::ds_max_u64; - op32_rtn = aco_opcode::ds_max_rtn_u32; - op64_rtn = aco_opcode::ds_max_rtn_u64; - break; - case nir_intrinsic_shared_atomic_and: - op32 = aco_opcode::ds_and_b32; - op64 = aco_opcode::ds_and_b64; - op32_rtn = aco_opcode::ds_and_rtn_b32; - op64_rtn = aco_opcode::ds_and_rtn_b64; - break; - case nir_intrinsic_shared_atomic_or: - op32 = aco_opcode::ds_or_b32; - op64 = aco_opcode::ds_or_b64; - op32_rtn = aco_opcode::ds_or_rtn_b32; - op64_rtn = aco_opcode::ds_or_rtn_b64; - break; - case nir_intrinsic_shared_atomic_xor: - op32 = 
aco_opcode::ds_xor_b32; - op64 = aco_opcode::ds_xor_b64; - op32_rtn = aco_opcode::ds_xor_rtn_b32; - op64_rtn = aco_opcode::ds_xor_rtn_b64; - break; - case nir_intrinsic_shared_atomic_exchange: - op32 = aco_opcode::ds_write_b32; - op64 = aco_opcode::ds_write_b64; - op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; - op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; - break; - case nir_intrinsic_shared_atomic_comp_swap: - op32 = aco_opcode::ds_cmpst_b32; - op64 = aco_opcode::ds_cmpst_b64; - op32_rtn = aco_opcode::ds_cmpst_rtn_b32; - op64_rtn = aco_opcode::ds_cmpst_rtn_b64; - num_operands = 4; - break; - case nir_intrinsic_shared_atomic_fadd: - op32 = aco_opcode::ds_add_f32; - op32_rtn = aco_opcode::ds_add_rtn_f32; - op64 = aco_opcode::num_opcodes; - op64_rtn = aco_opcode::num_opcodes; - break; - default: - unreachable("Unhandled shared atomic intrinsic"); + switch (instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + op32 = aco_opcode::ds_add_u32; + op64 = aco_opcode::ds_add_u64; + op32_rtn = aco_opcode::ds_add_rtn_u32; + op64_rtn = aco_opcode::ds_add_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imin: + op32 = aco_opcode::ds_min_i32; + op64 = aco_opcode::ds_min_i64; + op32_rtn = aco_opcode::ds_min_rtn_i32; + op64_rtn = aco_opcode::ds_min_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umin: + op32 = aco_opcode::ds_min_u32; + op64 = aco_opcode::ds_min_u64; + op32_rtn = aco_opcode::ds_min_rtn_u32; + op64_rtn = aco_opcode::ds_min_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imax: + op32 = aco_opcode::ds_max_i32; + op64 = aco_opcode::ds_max_i64; + op32_rtn = aco_opcode::ds_max_rtn_i32; + op64_rtn = aco_opcode::ds_max_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umax: + op32 = aco_opcode::ds_max_u32; + op64 = aco_opcode::ds_max_u64; + op32_rtn = aco_opcode::ds_max_rtn_u32; + op64_rtn = aco_opcode::ds_max_rtn_u64; + break; + case nir_intrinsic_shared_atomic_and: + op32 = aco_opcode::ds_and_b32; + op64 = aco_opcode::ds_and_b64; + op32_rtn = aco_opcode::ds_and_rtn_b32; + op64_rtn = aco_opcode::ds_and_rtn_b64; + break; + case nir_intrinsic_shared_atomic_or: + op32 = aco_opcode::ds_or_b32; + op64 = aco_opcode::ds_or_b64; + op32_rtn = aco_opcode::ds_or_rtn_b32; + op64_rtn = aco_opcode::ds_or_rtn_b64; + break; + case nir_intrinsic_shared_atomic_xor: + op32 = aco_opcode::ds_xor_b32; + op64 = aco_opcode::ds_xor_b64; + op32_rtn = aco_opcode::ds_xor_rtn_b32; + op64_rtn = aco_opcode::ds_xor_rtn_b64; + break; + case nir_intrinsic_shared_atomic_exchange: + op32 = aco_opcode::ds_write_b32; + op64 = aco_opcode::ds_write_b64; + op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; + op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; + break; + case nir_intrinsic_shared_atomic_comp_swap: + op32 = aco_opcode::ds_cmpst_b32; + op64 = aco_opcode::ds_cmpst_b64; + op32_rtn = aco_opcode::ds_cmpst_rtn_b32; + op64_rtn = aco_opcode::ds_cmpst_rtn_b64; + num_operands = 4; + break; + case nir_intrinsic_shared_atomic_fadd: + op32 = aco_opcode::ds_add_f32; + op32_rtn = aco_opcode::ds_add_rtn_f32; + op64 = aco_opcode::num_opcodes; + op64_rtn = aco_opcode::num_opcodes; + break; + default: unreachable("Unhandled shared atomic intrinsic"); } bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); @@ -6942,7 +7206,8 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } aco_ptr ds; - ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0)); + ds.reset( + create_instruction(op, Format::DS, num_operands, return_previous ? 
1 : 0)); ds->operands[0] = Operand(address); ds->operands[1] = Operand(data); if (num_operands == 4) { @@ -6957,21 +7222,22 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) ctx->block->instructions.emplace_back(std::move(ds)); } -Temp get_scratch_resource(isel_context *ctx) +Temp +get_scratch_resource(isel_context* ctx) { Builder bld(ctx->program, ctx->block); Temp scratch_addr = ctx->program->private_segment_buffer; if (ctx->stage != compute_cs) scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u)); - uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); + uint32_t rsrc_conf = + S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); if (ctx->program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx->program->chip_class <= + GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } @@ -6980,10 +7246,13 @@ Temp get_scratch_resource(isel_context *ctx) if (ctx->program->chip_class <= GFX8) rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), + Operand(rsrc_conf)); } -void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); Temp rsrc = get_scratch_resource(ctx); Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); @@ -6999,7 +7268,9 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { emit_load(ctx, bld, info, scratch_load_params); } -void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) +{ Builder bld(ctx->program, ctx->block); Temp rsrc = get_scratch_resource(ctx); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); @@ -7012,21 +7283,23 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { Temp write_datas[32]; unsigned offsets[32]; unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 
4 : 16; - split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - swizzle_component_size, &write_count, write_datas, offsets); + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size, + &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - Instruction *mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], + offsets[i], true, true); mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); } } -void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { +void +visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr) +{ uint8_t log2_ps_iter_samples; if (ctx->program->info->ps.uses_sample_shading) { - log2_ps_iter_samples = - util_logbase2(ctx->options->key.fs.num_samples); + log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples); } else { log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; } @@ -7041,31 +7314,34 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u)); Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, bld.copy(bld.def(v1), Operand(1u))); - bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage)); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, + get_arg(ctx, ctx->args->ac.sample_coverage)); } else { bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage)); } } -void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); unsigned stream = nir_intrinsic_stream_id(instr); Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u); - nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]); + nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]); /* get GSVS ring */ - Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), + ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); - unsigned num_components = - ctx->program->info->gs.num_stream_output_components[stream]; + unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream]; unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out; unsigned stream_offset = 0; for (unsigned i = 0; i < stream; i++) { - unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out; + unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * + ctx->shader->info.gs.vertices_out; stream_offset += prev_stride * ctx->program->wave_size; } @@ -7075,26 +7351,25 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst Temp gsvs_dwords[4]; for (unsigned i = 0; i < 4; i++) gsvs_dwords[i] = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, - Definition(gsvs_dwords[0]), - Definition(gsvs_dwords[1]), - Definition(gsvs_dwords[2]), - 
Definition(gsvs_dwords[3]), - gsvs_ring); + bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]), + Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring); if (stream_offset) { Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset)); Temp carry = bld.tmp(s1); - gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp); - gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry)); + gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), + gsvs_dwords[0], stream_offset_tmp); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), + gsvs_dwords[1], Operand(0u), bld.scc(carry)); } - gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride))); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], + Operand(S_008F04_STRIDE(stride))); gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size)); - gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]); + gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1], + gsvs_dwords[2], gsvs_dwords[3]); unsigned offset = 0; for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { @@ -7112,11 +7387,13 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst if (vaddr_offset.isUndefined()) vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u)); else - vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); + vaddr_offset = + bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); const_offset %= 4096u; } - aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; + aco_ptr mtbuf{create_instruction( + aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; mtbuf->operands[0] = Operand(gsvs_ring); mtbuf->operands[1] = vaddr_offset; mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset)); @@ -7142,55 +7419,71 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); } -Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) +Temp +emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src) { Builder bld(ctx->program, ctx->block); if (cluster_size == 1) { return src; - } if (op == nir_op_iand && cluster_size == 4) { - //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + } + if (op == nir_op_iand && cluster_size == 4) { + /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */ + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); } else if (op == nir_op_ior && cluster_size == 4) { - //subgroupClusteredOr(val, 4) -> wqm(val & exec) - return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), - bld.sop2(Builder::s_and, bld.def(bld.lm), 
bld.def(s1, scc), src, Operand(exec, bld.lm))); + /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */ + return bld.sop1( + Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { - //subgroupAnd(val) -> (exec & ~val) == 0 - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + /* subgroupAnd(val) -> (exec & ~val) == 0 */ + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) + .def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { - //subgroupOr(val) -> (val & exec) != 0 - Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); + /* subgroupOr(val) -> (val & exec) != 0 */ + Temp tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)) + .def(1) + .getTemp(); return bool_to_vector_condition(ctx, tmp); } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) { - //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 - Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */ + Temp tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); - tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp(); + tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)) + .def(1) + .getTemp(); return bool_to_vector_condition(ctx, tmp); } else { - //subgroupClustered{And,Or,Xor}(val, n) -> - //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32 - //cluster_offset = ~(n - 1) & lane_id - //cluster_mask = ((1 << n) - 1) - //subgroupClusteredAnd(): - // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask - //subgroupClusteredOr(): - // return ((val & exec) >> cluster_offset) & cluster_mask != 0 - //subgroupClusteredXor(): - // return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 + /* subgroupClustered{And,Or,Xor}(val, n): + * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32) + * cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1) + * subgroupClusteredAnd(): + * return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask + * subgroupClusteredOr(): + * return ((val & exec) >> cluster_offset) & cluster_mask != 0 + * subgroupClusteredXor(): + * return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0 + */ Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1)); - Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id); + Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), + Operand(~uint32_t(cluster_size - 1)), lane_id); Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, + 
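The comment above fully specifies the clustered boolean reduction; a scalar model over wave64 ballots makes the bit manipulation easier to follow. Assumptions: val and exec are 64-bit ballots, n is a power-of-two cluster size, and the helper name is hypothetical.

#include <bit>
#include <cstdint>

enum class bool_red { iand, ior, ixor };

/* One lane's result of subgroupClustered{And,Or,Xor}(val, n), following the
 * formula in the comment above (cluster_offset = ~(n - 1) & lane_id). */
static bool
clustered_bool_reduce(uint64_t val, uint64_t exec, unsigned lane_id, unsigned n, bool_red op)
{
   unsigned cluster_offset = ~(n - 1) & lane_id;
   uint64_t cluster_mask = n == 64 ? ~0ull : (1ull << n) - 1ull;
   switch (op) {
   case bool_red::iand: /* inactive lanes must not break the AND, hence "| ~exec" */
      return (((val | ~exec) >> cluster_offset) & cluster_mask) == cluster_mask;
   case bool_red::ior:
      return (((val & exec) >> cluster_offset) & cluster_mask) != 0;
   case bool_red::ixor:
      return (std::popcount(((val & exec) >> cluster_offset) & cluster_mask) & 1) != 0;
   }
   return false;
}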
Operand(exec, bld.lm)); else - tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u; @@ -7205,7 +7498,8 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp); if (op == nir_op_iand) { - return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand(cluster_mask), tmp); + return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand(cluster_mask), + tmp); } else if (op == nir_op_ior) { return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand(0u), tmp); } else if (op == nir_op_ixor) { @@ -7218,17 +7512,20 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te } } -Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) +Temp +emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src) { Builder bld(ctx->program, ctx->block); assert(src.regClass() == bld.lm); - //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 - //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 - //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 + /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 + * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 + * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 + */ Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); else tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); @@ -7246,13 +7543,15 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) return Temp(); } -Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) +Temp +emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src) { Builder bld(ctx->program, ctx->block); - //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val - //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val - //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val + * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val + * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + */ Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); if (op == nir_op_iand) return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); @@ -7265,32 +7564,39 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) return Temp(); } -ReduceOp get_reduce_op(nir_op op, unsigned bit_size) +ReduceOp +get_reduce_op(nir_op op, unsigned bit_size) { switch (op) { - #define CASEI(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; - #define CASEF(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? 
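The exclusive boolean scans earlier in this hunk rest on the same mbcnt idea: count the relevant bits strictly below the current lane. A scalar sketch under the same wave64-ballot assumption, with hypothetical helper names:

#include <bit>
#include <cstdint>

/* Number of set bits of 'mask' strictly below 'lane_id' (what v_mbcnt computes). */
static unsigned
mbcnt(uint64_t mask, unsigned lane_id)
{
   return std::popcount(mask & ((1ull << lane_id) - 1ull));
}

/* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 */
static bool
exclusive_scan_and(uint64_t val, uint64_t exec, unsigned lane_id)
{
   return mbcnt(exec & ~val, lane_id) == 0;
}

/* subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 */
static bool
exclusive_scan_xor(uint64_t val, uint64_t exec, unsigned lane_id)
{
   return (mbcnt(val & exec, lane_id) & 1) != 0;
}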
name##16 : name##64; - CASEI(iadd) - CASEI(imul) - CASEI(imin) - CASEI(umin) - CASEI(imax) - CASEI(umax) - CASEI(iand) - CASEI(ior) - CASEI(ixor) - CASEF(fadd) - CASEF(fmul) - CASEF(fmin) - CASEF(fmax) - default: - unreachable("unknown reduction op"); - #undef CASEI - #undef CASEF +#define CASEI(name) \ + case nir_op_##name: \ + return (bit_size == 32) ? name##32 \ + : (bit_size == 16) ? name##16 \ + : (bit_size == 8) ? name##8 \ + : name##64; +#define CASEF(name) \ + case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) + default: unreachable("unknown reduction op"); +#undef CASEI +#undef CASEF } } -void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src) +void +emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src) { Builder bld(ctx->program, ctx->block); Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); @@ -7301,7 +7607,8 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s bld.copy(dst, src); } -void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, nir_src src, Temp count) +void +emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count) { Builder bld(ctx->program, ctx->block); Temp src_tmp = get_ssa_temp(ctx, src.ssa); @@ -7329,8 +7636,7 @@ void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, src_tmp = bld.as_uniform(src_tmp); if (op == nir_op_ixor && count.type() == RegType::sgpr) - count = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - count, Operand(1u)); + count = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand(1u)); else if (op == nir_op_ixor) count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), count); @@ -7360,7 +7666,8 @@ void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, } } -bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) +bool +emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr) { nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); if (op == nir_op_imul || op == nir_op_fmul) @@ -7373,8 +7680,8 @@ bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) if (bit_size > 32) return false; - Temp thread_count = bld.sop1( - Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); + Temp thread_count = + bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count); } else { @@ -7384,7 +7691,8 @@ bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) +bool +emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); @@ -7408,18 +7716,15 @@ bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) return true; } - assert(op == nir_op_imin || op == nir_op_umin || - op == nir_op_imax || op == nir_op_umax || - op == nir_op_iand || op == nir_op_ior || - op == nir_op_fmin || op == nir_op_fmax); + assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax || + op == nir_op_iand || op == 
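emit_addition_uniform_reduce above exploits that a wave-uniform operand makes the cross-lane work trivial: an iadd reduction is just the value scaled by the number of active lanes, and an ixor reduction keeps only the parity of that count (the s_and_b32 with 1 above). A scalar sketch with hypothetical helpers:

#include <bit>
#include <cstdint>

static uint32_t
uniform_iadd_reduce(uint32_t value, uint64_t exec)
{
   return value * (uint32_t)std::popcount(exec); /* thread_count = s_bcnt1(exec) */
}

static uint32_t
uniform_ixor_reduce(uint32_t value, uint64_t exec)
{
   return value * (uint32_t)(std::popcount(exec) & 1); /* only the parity survives */
}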
nir_op_ior || op == nir_op_fmin || op == nir_op_fmax); if (inc) { emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa)); return true; } - /* Copy the source and write the reduction operation identity to the first - * lane. */ + /* Copy the source and write the reduction operation identity to the first lane. */ Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size); @@ -7440,8 +7745,9 @@ bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr) return true; } -Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, - unsigned cluster_size, Definition dst, Temp src) +Temp +emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size, + Definition dst, Temp src) { assert(src.bytes() <= 8); assert(src.type() == RegType::vgpr); @@ -7454,14 +7760,13 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */ /* scalar identity temporary */ - bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) && aco_op != aco_opcode::p_reduce; + bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) && + aco_op != aco_opcode::p_reduce; if (aco_op == aco_opcode::p_exclusive_scan) { - need_sitmp |= - (op == imin8 || op == imin16 || op == imin32 || op == imin64 || - op == imax8 || op == imax16 || op == imax32 || op == imax64 || - op == fmin16 || op == fmin32 || op == fmin64 || - op == fmax16 || op == fmax32 || op == fmax64 || - op == fmul16 || op == fmul64); + need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 || + op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 || + op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 || + op == fmul64); } if (need_sitmp) defs[num_defs++] = bld.def(RegType::sgpr, dst.size()); @@ -7481,7 +7786,8 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, if (clobber_vcc) defs[num_defs++] = bld.def(bld.lm, vcc); - Pseudo_reduction_instruction *reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs); + Pseudo_reduction_instruction* reduce = create_instruction( + aco_op, Format::PSEUDO_REDUCTION, 3, num_defs); reduce->operands[0] = Operand(src); /* setup_reduce_temp will update these undef operands if needed */ reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear()); @@ -7495,7 +7801,8 @@ Temp emit_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, return dst.getTemp(); } -void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) +void +emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2) { Builder bld(ctx->program, ctx->block); Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center); @@ -7529,7 +7836,8 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) } /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ - aco_opcode mad = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; + aco_opcode mad = + ctx->program->chip_class >= GFX10_3 ? 
aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1); Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2); tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1); @@ -7542,14 +7850,15 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) return; } -Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i); -void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt); -static void create_vs_exports(isel_context *ctx); +Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); +void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt); +static void create_vs_exports(isel_context* ctx); -void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) +void +visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); - switch(instr->intrinsic) { + switch (instr->intrinsic) { case nir_intrinsic_load_barycentric_sample: case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: { @@ -7573,14 +7882,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) bary = get_arg(ctx, ctx->args->ac.linear_sample); break; - default: - break; + default: break; } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp p1 = emit_extract_vector(ctx, bary, 0, v1); Temp p2 = emit_extract_vector(ctx, bary, 1, v1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - Operand(p1), Operand(p2)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2)); emit_split_vector(ctx, dst, 2); break; } @@ -7591,55 +7898,64 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp p1 = emit_extract_vector(ctx, model, 0, v1); Temp p2 = emit_extract_vector(ctx, model, 1, v1); Temp p3 = emit_extract_vector(ctx, model, 2, v1); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - Operand(p1), Operand(p2), Operand(p3)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2), + Operand(p3)); emit_split_vector(ctx, dst, 3); break; } case nir_intrinsic_load_barycentric_at_sample: { uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; switch (ctx->options->key.fs.num_samples) { - case 2: sample_pos_offset += 1 << 3; break; - case 4: sample_pos_offset += 3 << 3; break; - case 8: sample_pos_offset += 7 << 3; break; - default: break; + case 2: sample_pos_offset += 1 << 3; break; + case 4: sample_pos_offset += 3 << 3; break; + case 8: sample_pos_offset += 7 << 3; break; + default: break; } Temp sample_pos; Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); Temp private_segment_buffer = ctx->program->private_segment_buffer; - //TODO: bounds checking? + // TODO: bounds checking? 
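The sample-position fetch around here is an offset computation into the PS sample-positions ring: each position is 8 bytes, and the 2x/4x/8x tables are stacked so they start 1, 3 and 7 entries in (the switch above). A sketch of the byte-offset math, hypothetical helper:

#include <cstdint>

static uint32_t
sample_pos_byte_offset(uint32_t ring_base /* RING_PS_SAMPLE_POSITIONS * 16 */,
                       uint32_t num_samples, uint32_t sample_index)
{
   uint32_t offset = ring_base;
   switch (num_samples) {
   case 2: offset += 1u << 3; break;
   case 4: offset += 3u << 3; break;
   case 8: offset += 7u << 3; break;
   default: break;
   }
   return offset + (sample_index << 3); /* the s_lshl3_add_u32 / v_lshlrev path below */
}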
if (addr.type() == RegType::sgpr) { Operand offset; if (const_addr) { sample_pos_offset += const_addr->u32 << 3; offset = Operand(sample_pos_offset); } else if (ctx->options->chip_class >= GFX9) { - offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); + offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, + Operand(sample_pos_offset)); } else { - offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); - offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset)); + offset = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); + offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, + Operand(sample_pos_offset)); } Operand off = bld.copy(bld.def(s1), Operand(offset)); - sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); + sample_pos = + bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); } else if (ctx->options->chip_class >= GFX9) { addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); - sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset); + sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, + private_segment_buffer, sample_pos_offset); } else if (ctx->options->chip_class >= GFX7) { /* addr += private_segment_buffer + sample_pos_offset */ Temp tmp0 = bld.tmp(s1); Temp tmp1 = bld.tmp(s1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), + private_segment_buffer); Definition scc_tmp = bld.def(s1, scc); - tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); - tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp())); + tmp0 = + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); + tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), + bld.scc(scc_tmp.getTemp())); addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); Temp pck0 = bld.tmp(v1); Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); tmp1 = as_vgpr(ctx, tmp1); - Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); + Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), + bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); /* sample_pos = flat_load_dwordx2 addr */ @@ -7649,14 +7965,16 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf)); + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, + Operand(0u), Operand(rsrc_conf)); addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u)); sample_pos = bld.tmp(v2); - aco_ptr 
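The pre-GFX9 path above has to assemble a 64-bit address out of 32-bit halves: add with carry-out on the low dword, then add-with-carry on the high dword, first on the SALU and again on the VALU. A scalar model of that split addition, hypothetical helper:

#include <cstdint>

static void
add64_split(uint32_t lo_a, uint32_t hi_a, uint64_t b, uint32_t* lo_out, uint32_t* hi_out)
{
   uint64_t lo_sum = (uint64_t)lo_a + (uint32_t)b;
   *lo_out = (uint32_t)lo_sum;
   uint32_t carry = (uint32_t)(lo_sum >> 32);    /* s_add_u32 leaves this in SCC */
   *hi_out = hi_a + (uint32_t)(b >> 32) + carry; /* s_addc_u32 consumes it */
}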
load{create_instruction(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; + aco_ptr load{create_instruction( + aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; load->definitions[0] = Definition(sample_pos); load->operands[0] = Operand(rsrc); load->operands[1] = Operand(addr); @@ -7690,14 +8008,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_front_face: { bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc); + Operand(0u), get_arg(ctx, ctx->args->ac.front_face)) + .def(0) + .setHint(vcc); break; } case nir_intrinsic_load_view_index: { - if (ctx->stage.has(SWStage::VS) || - ctx->stage.has(SWStage::GS) || - ctx->stage.has(SWStage::TCS) || - ctx->stage.has(SWStage::TES)) { + if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) || + ctx->stage.has(SWStage::TCS) || ctx->stage.has(SWStage::TES)) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); break; @@ -7725,48 +8043,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); break; } - case nir_intrinsic_load_tess_coord: - visit_load_tess_coord(ctx, instr); - break; - case nir_intrinsic_load_interpolated_input: - visit_load_interpolated_input(ctx, instr); - break; - case nir_intrinsic_store_output: - visit_store_output(ctx, instr); - break; + case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break; + case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break; + case nir_intrinsic_store_output: visit_store_output(ctx, instr); break; case nir_intrinsic_load_input: - case nir_intrinsic_load_input_vertex: - visit_load_input(ctx, instr); - break; - case nir_intrinsic_load_per_vertex_input: - visit_load_per_vertex_input(ctx, instr); - break; - case nir_intrinsic_load_ubo: - visit_load_ubo(ctx, instr); - break; - case nir_intrinsic_load_push_constant: - visit_load_push_constant(ctx, instr); - break; - case nir_intrinsic_load_constant: - visit_load_constant(ctx, instr); - break; - case nir_intrinsic_vulkan_resource_index: - visit_load_resource(ctx, instr); - break; + case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break; + case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break; + case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break; + case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break; + case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break; + case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break; case nir_intrinsic_terminate: - case nir_intrinsic_discard: - visit_discard(ctx, instr); - break; + case nir_intrinsic_discard: visit_discard(ctx, instr); break; case nir_intrinsic_terminate_if: - case nir_intrinsic_discard_if: - visit_discard_if(ctx, instr); - break; - case nir_intrinsic_load_shared: - visit_load_shared(ctx, instr); - break; - case nir_intrinsic_store_shared: - visit_store_shared(ctx, instr); - break; + case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break; + case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break; + case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break; case nir_intrinsic_shared_atomic_add: case nir_intrinsic_shared_atomic_imin: case 
nir_intrinsic_shared_atomic_umin: @@ -7777,16 +8069,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: - visit_shared_atomic(ctx, instr); - break; + case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break; case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_sparse_load: - visit_image_load(ctx, instr); - break; - case nir_intrinsic_image_deref_store: - visit_image_store(ctx, instr); - break; + case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break; + case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break; case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_imin: @@ -7796,33 +8082,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_image_deref_atomic_or: case nir_intrinsic_image_deref_atomic_xor: case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - visit_image_atomic(ctx, instr); - break; - case nir_intrinsic_image_deref_size: - visit_image_size(ctx, instr); - break; - case nir_intrinsic_image_deref_samples: - visit_image_samples(ctx, instr); - break; - case nir_intrinsic_load_ssbo: - visit_load_ssbo(ctx, instr); - break; - case nir_intrinsic_store_ssbo: - visit_store_ssbo(ctx, instr); - break; - case nir_intrinsic_load_global: - visit_load_global(ctx, instr); - break; - case nir_intrinsic_load_buffer_amd: - visit_load_buffer(ctx, instr); - break; - case nir_intrinsic_store_buffer_amd: - visit_store_buffer(ctx, instr); - break; - case nir_intrinsic_store_global: - visit_store_global(ctx, instr); - break; + case nir_intrinsic_image_deref_atomic_comp_swap: visit_image_atomic(ctx, instr); break; + case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break; + case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break; + case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break; + case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break; + case nir_intrinsic_load_global: visit_load_global(ctx, instr); break; + case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break; + case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break; + case nir_intrinsic_store_global: visit_store_global(ctx, instr); break; case nir_intrinsic_global_atomic_add: case nir_intrinsic_global_atomic_imin: case nir_intrinsic_global_atomic_umin: @@ -7832,9 +8100,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_global_atomic_or: case nir_intrinsic_global_atomic_xor: case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_comp_swap: - visit_global_atomic(ctx, instr); - break; + case nir_intrinsic_global_atomic_comp_swap: visit_global_atomic(ctx, instr); break; case nir_intrinsic_ssbo_atomic_add: case nir_intrinsic_ssbo_atomic_imin: case nir_intrinsic_ssbo_atomic_umin: @@ -7844,21 +8110,11 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_ssbo_atomic_or: case nir_intrinsic_ssbo_atomic_xor: case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - visit_atomic_ssbo(ctx, instr); - break; - case nir_intrinsic_load_scratch: - visit_load_scratch(ctx, instr); - break; - case 
nir_intrinsic_store_scratch: - visit_store_scratch(ctx, instr); - break; - case nir_intrinsic_get_ssbo_size: - visit_get_ssbo_size(ctx, instr); - break; - case nir_intrinsic_scoped_barrier: - emit_scoped_barrier(ctx, instr); - break; + case nir_intrinsic_ssbo_atomic_comp_swap: visit_atomic_ssbo(ctx, instr); break; + case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; + case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; + case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break; + case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break; case nir_intrinsic_load_num_workgroups: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); @@ -7873,7 +8129,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_workgroup_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - struct ac_arg *args = ctx->args->ac.workgroup_ids; + struct ac_arg* args = ctx->args->ac.workgroup_ids; bld.pseudo(aco_opcode::p_create_vector, Definition(dst), args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u), args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u), @@ -7883,7 +8139,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_local_invocation_index: { if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); break; } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) { bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx)); @@ -7896,26 +8153,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) * we need this multiplied by the wave size, and then OR the thread id to it. 
*/ if (ctx->program->wave_size == 64) { - /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */ - Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), - get_arg(ctx, ctx->args->ac.tg_size)); - bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id); + /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just + * feed that to v_or */ + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + Operand(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size)); + bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, + id); } else { - /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ + /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); - bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id); + bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + tg_num, Operand(0x5u), id); } break; } case nir_intrinsic_load_subgroup_id: { if (ctx->stage == compute_cs) { - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), + Operand(0x6u | (0x6u << 16))); } else if (ctx->stage.hw == HWStage::NGG) { /* Get the id of the current wave within the threadgroup (workgroup) */ - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), + Operand(24u | (4u << 16))); } else { bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u)); } @@ -7927,11 +8189,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_load_num_subgroups: { if (ctx->stage == compute_cs) - bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), - get_arg(ctx, ctx->args->ac.tg_size)); + bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), Operand(0x3fu), get_arg(ctx, ctx->args->ac.tg_size)); else if (ctx->stage.hw == HWStage::NGG) - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16))); + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), + Operand(28u | (4u << 16))); else bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u)); break; @@ -7968,7 +8231,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); - if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1])) + if 
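Both branches above decode the same thing: the tg_size argument keeps the wave's index within the workgroup in bits [6..11], so the flat local invocation index is wave_id * wave_size | lane_id. A scalar sketch of that decoding, field layout as used above:

#include <cstdint>

static uint32_t
local_invocation_index(uint32_t tg_size, uint32_t lane_id, uint32_t wave_size)
{
   uint32_t wave_id = (tg_size >> 6) & 0x3f; /* s_bfe_u32 tg_size, 6 | (6 << 16) */
   if (wave_size == 64)
      return (tg_size & 0xfc0u) | lane_id;   /* wave_id is already shifted by 6, i.e. *64 */
   return (wave_id << 5) | lane_id;          /* wave32: v_lshl_or_b32 by 5 */
}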
(instr->intrinsic == nir_intrinsic_read_invocation || + !nir_src_is_divergent(instr->src[1])) tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -7979,7 +8243,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp tmp = bld.tmp(v1); tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp); if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), + bld.def(src.regClass() == v1b ? v3b : v2b), tmp); else bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); } else if (src.regClass() == v1) { @@ -8006,7 +8271,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); tmp = emit_extract_vector(ctx, tmp, 0, v1); tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); - emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst); + emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), + dst); } else { isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } @@ -8026,9 +8292,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { - emit_wqm(bld, - bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), - dst); + emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); } else if (src.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); @@ -8052,7 +8316,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp tmp = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) + .def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); break; @@ -8072,13 +8339,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_exclusive_scan: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - nir_op op = (nir_op) nir_intrinsic_reduction_op(instr); - unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ? - nir_intrinsic_cluster_size(instr) : 0; - cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); + nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); + unsigned cluster_size = + instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0; + cluster_size = util_next_power_of_two( + MIN2(cluster_size ? 
cluster_size : ctx->program->wave_size, ctx->program->wave_size)); - if (!nir_src_is_divergent(instr->src[0]) && - cluster_size == ctx->program->wave_size && instr->dest.ssa.bit_size != 1) { + if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size && + instr->dest.ssa.bit_size != 1) { /* We use divergence analysis to assign the regclass, so check if it's * working as expected */ ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan; @@ -8113,8 +8381,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_inclusive_scan: emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst); break; - default: - assert(false); + default: assert(false); } } else if (cluster_size == 1) { bld.copy(Definition(dst), src); @@ -8127,14 +8394,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode aco_op; switch (instr->intrinsic) { - case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; - case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; - case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; - default: - unreachable("unknown reduce intrinsic"); + case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; + case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; + case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; + default: unreachable("unknown reduce intrinsic"); } - Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, bld.def(dst.regClass()), src); + Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, + bld.def(dst.regClass()), src); emit_wqm(bld, tmp_dst, dst); } break; @@ -8155,31 +8422,39 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); uint32_t half_mask = 0x11111111u << lane; - Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); + Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), + Operand(half_mask)); Temp tmp = bld.tmp(bld.lm); bld.sop1(Builder::s_wqm, Definition(tmp), bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, - bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, + Operand(exec, bld.lm)))); emit_wqm(bld, tmp, dst); } else if (instr->dest.ssa.bit_size == 8) { Temp tmp = bld.tmp(v1); if (ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + tmp); bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); } else if (instr->dest.ssa.bit_size == 16) { Temp tmp = bld.tmp(v1); if (ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + tmp); bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { if 
(ctx->program->chip_class >= GFX8) emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); else - emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst); + emit_wqm(bld, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), + dst); } else if (instr->dest.ssa.bit_size == 64) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); @@ -8187,8 +8462,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) lo = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl)); hi = emit_wqm(bld, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl)); } else { - lo = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl)); - hi = emit_wqm(bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl)); + lo = emit_wqm( + bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl)); + hi = emit_wqm( + bld, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl)); } bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); @@ -8209,20 +8486,11 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } uint16_t dpp_ctrl = 0; switch (instr->intrinsic) { - case nir_intrinsic_quad_swap_horizontal: - dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); - break; - case nir_intrinsic_quad_swap_vertical: - dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); - break; - case nir_intrinsic_quad_swap_diagonal: - dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); - break; - case nir_intrinsic_quad_swizzle_amd: - dpp_ctrl = nir_intrinsic_swizzle_mask(instr); - break; - default: - break; + case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break; + case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break; + case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break; + case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break; + default: break; } if (ctx->program->chip_class < GFX8) dpp_ctrl |= (1 << 15); @@ -8234,7 +8502,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->dest.ssa.bit_size == 1) { assert(src.regClass() == bld.lm); - src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), + Operand((uint32_t)-1), src); if (ctx->program->chip_class >= GFX8) src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); else @@ -8293,7 +8562,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->dest.ssa.bit_size == 1) { assert(src.regClass() == bld.lm); - src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), + Operand((uint32_t)-1), src); src = emit_masked_swizzle(ctx, bld, src, mask); Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(bld, tmp, dst); @@ -8353,8 +8623,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); assert(dst.regClass() == v1); assert(ctx->program->chip_class >= GFX8); - bld.vop3(aco_opcode::v_perm_b32, Definition(dst), - get_ssa_temp(ctx, instr->src[0].ssa), + 
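The dpp_quad_perm controls selected above are per-quad source permutations: within each aligned group of four lanes, lane (base + i) reads from lane (base + perm[i]), so {1,0,3,2} swaps horizontally, {2,3,0,1} vertically and {3,2,1,0} diagonally. A scalar model of the lane selection, hypothetical helper:

#include <cstdint>

static unsigned
quad_perm_source_lane(unsigned lane, const unsigned perm[4])
{
   unsigned base = lane & ~3u; /* first lane of this quad */
   return base + perm[lane & 3u];
}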
bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa), as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)), as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa))); break; @@ -8368,7 +8637,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.copy(Definition(dst), src); } else if (dst.regClass() == v1 && src.regClass() == v1) { bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src, - bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)), bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa))); + bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)), + bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa))); } else { isel_err(&instr->instr, "Unimplemented lane_permute_16_amd"); } @@ -8395,7 +8665,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_demote_if: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); assert(src.regClass() == bld.lm); - Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + Temp cond = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_demote_to_helper, cond); if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) @@ -8418,20 +8689,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_elect: { Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); - emit_wqm(bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u), first), + emit_wqm(bld, + bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u), first), get_ssa_temp(ctx, &instr->dest.ssa)); break; } case nir_intrinsic_shader_clock: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && ctx->options->chip_class >= GFX10_3) { + if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && + ctx->options->chip_class >= GFX10_3) { /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */ Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand(0u)); } else { - aco_opcode opcode = - nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? - aco_opcode::s_memrealtime : aco_opcode::s_memtime; + aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE + ? 
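The shader_clock path just above reads the SHADER_CYCLES counter with s_getreg_b32; the immediate packs the hardware register id in the low bits and the field size minus one at bit 11, with the field offset in between, which is where the ((20 - 1) << 11) | 29 constant comes from. A sketch of that packing, assuming the usual ID/OFFSET/SIZE field layout:

#include <cstdint>

static uint32_t
getreg_imm(uint32_t hwreg_id, uint32_t offset, uint32_t size)
{
   return hwreg_id | (offset << 6) | ((size - 1) << 11);
}

// getreg_imm(29 /* SHADER_CYCLES */, 0, 20) == ((20 - 1) << 11) | 29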
aco_opcode::s_memrealtime + : aco_opcode::s_memtime; bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile)); } emit_split_vector(ctx, dst, 2); @@ -8467,12 +8740,13 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { if (ctx->options->chip_class >= GFX10) - bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), + get_arg(ctx, ctx->args->ac.gs_invocation_id)); else bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { - bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u)); + bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids), + Operand(8u), Operand(5u)); } else { unreachable("Unsupported stage for load_invocation_id"); } @@ -8494,7 +8768,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; default: if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) { - /* In case of NGG, the GS threads always have the primitive ID even if there is no SW GS. */ + /* In case of NGG, the GS threads always have the primitive ID + * even if there is no SW GS. */ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); break; } @@ -8519,7 +8794,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_end_primitive_with_counter: { if (ctx->stage.hw != HWStage::NGG) { unsigned stream = nir_intrinsic_stream_id(instr); - bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream)); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, + sendmsg_gs(true, false, stream)); } break; } @@ -8538,7 +8814,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_tess_factors_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.tcs_factor_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.tcs_factor_offset)); break; } case nir_intrinsic_load_ring_tess_offchip_amd: { @@ -8547,7 +8824,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_tess_offchip_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.tess_offchip_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.tess_offchip_offset)); break; } case nir_intrinsic_load_ring_esgs_amd: { @@ -8557,12 +8835,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_load_ring_es2gs_offset_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.es2gs_offset)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.es2gs_offset)); break; } case nir_intrinsic_load_gs_vertex_offset_amd: { unsigned b = nir_intrinsic_base(instr); - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); break; } case nir_intrinsic_has_input_vertex_amd: @@ -8575,9 +8855,11 @@ void visit_intrinsic(isel_context *ctx, 
nir_intrinsic_instr *instr) case nir_intrinsic_load_workgroup_num_input_vertices_amd: case nir_intrinsic_load_workgroup_num_input_primitives_amd: { assert(ctx->stage.hw == HWStage::NGG); - unsigned pos = instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22; - bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(pos | (9u << 16u))); + unsigned pos = + instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22; + bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info), + Operand(pos | (9u << 16u))); break; } case nir_intrinsic_load_initial_edgeflag_amd: { @@ -8586,11 +8868,13 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) unsigned i = nir_src_as_uint(instr->src[0]); Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); - bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), gs_invocation_id, Operand(8u + i), Operand(1u)); + bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + gs_invocation_id, Operand(8u + i), Operand(1u)); break; } case nir_intrinsic_load_packed_passthrough_primitive_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.gs_vtx_offset[0])); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ac.gs_vtx_offset[0])); break; } case nir_intrinsic_export_vertex_amd: { @@ -8602,8 +8886,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(ctx->stage.hw == HWStage::NGG); Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa); bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1), - 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, - false /* compressed */, true/* done */, false /* valid mask */); + 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, + true /* done */, false /* valid mask */); break; } case nir_intrinsic_alloc_vertices_and_primitives_amd: { @@ -8618,21 +8902,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa); Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa); Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val))); - bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, true); + bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, + true); break; } case nir_intrinsic_load_shader_query_enabled_amd: { unsigned cmp_bit = 0; - Temp shader_query_enabled = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(cmp_bit)); - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bool_to_vector_condition(ctx, shader_query_enabled)); + Temp shader_query_enabled = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), + get_arg(ctx, ctx->args->ngg_gs_state), Operand(cmp_bit)); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bool_to_vector_condition(ctx, shader_query_enabled)); break; } - case nir_intrinsic_load_sbt_amd: - visit_load_sbt_amd(ctx, instr); - break; - case nir_intrinsic_bvh64_intersect_ray_amd: - visit_bvh64_intersect_ray_amd(ctx, instr); - break; + case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break; + case 
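Several SGPR bitfield reads in this hunk use the s_bfe_u32 immediate form "offset | (size << 16)"; for the NGG path above, gs_tg_info carries the workgroup's input vertex count in a 9-bit field at bit 12 and the input primitive count in a 9-bit field at bit 22. A scalar model of the extraction, hypothetical helper:

#include <cstdint>

static uint32_t
sgpr_bfe(uint32_t value, uint32_t offset, uint32_t size)
{
   return (value >> offset) & ((1u << size) - 1u);
}

// sgpr_bfe(gs_tg_info, 12, 9) -> input vertices; sgpr_bfe(gs_tg_info, 22, 9) -> input primitives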
nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; default: isel_err(&instr->instr, "Unimplemented intrinsic instr"); abort(); @@ -8641,13 +8924,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } } - -void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, - Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr, - enum glsl_base_type *stype) +void +tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr, + Temp* fmask_ptr, enum glsl_base_type* stype) { - nir_deref_instr *texture_deref_instr = NULL; - nir_deref_instr *sampler_deref_instr = NULL; + nir_deref_instr* texture_deref_instr = NULL; + nir_deref_instr* sampler_deref_instr = NULL; int plane = -1; for (unsigned i = 0; i < instr->num_srcs; i++) { @@ -8658,11 +8940,8 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, case nir_tex_src_sampler_deref: sampler_deref_instr = nir_src_as_deref(instr->src[i].src); break; - case nir_tex_src_plane: - plane = nir_src_as_int(instr->src[i].src); - break; - default: - break; + case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break; + default: break; } } @@ -8672,11 +8951,11 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, sampler_deref_instr = texture_deref_instr; if (plane >= 0) { - assert(instr->op != nir_texop_txf_ms && - instr->op != nir_texop_samples_identical); - assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); - *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); - } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, + (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); + } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false); } else if (instr->op == nir_texop_fragment_mask_fetch) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); @@ -8695,26 +8974,25 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]), - Definition(img[2]), Definition(img[3]), Definition(img[4]), - Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr); + Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]), + Definition(img[6]), Definition(img[7]), *res_ptr); bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]), Definition(samp[2]), Definition(samp[3]), *samp_ptr); samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]); - *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), - img[0], img[1], img[2], img[3], - img[4], img[5], img[6], img[7]); - *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - samp[0], samp[1], samp[2], samp[3]); + *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2], + img[3], img[4], img[5], img[6], img[7]); + *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2], + samp[3]); } } - if (fmask_ptr && (instr->op == nir_texop_txf_ms || - instr->op == 
nir_texop_samples_identical)) + if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical)) *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); } -void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, - Temp *out_ma, Temp *out_sc, Temp *out_tc) +void +build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc, + Temp* out_tc) { Builder bld(ctx->program, ctx->block); @@ -8727,28 +9005,30 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, Operand two(0x40000000u); Operand four(0x40800000u); - Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); + Temp is_ma_positive = + bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id); is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); - Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), + bld.def(s1, scc), is_ma_z, is_ma_y); - // select sc + /* select sc */ Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); - Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), - one, is_ma_y); + Temp sgn = bld.vop2_e64( + aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y); *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); - // select tc + /* select tc */ tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y); sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y); *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); - // select ma + /* select ma */ tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y), deriv_z, is_ma_z); @@ -8756,24 +9036,29 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); } -void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) +void +prepare_cube_coords(isel_context* ctx, std::vector& coords, Temp* ddx, Temp* ddy, + bool is_deriv, bool is_array) { Builder bld(ctx->program, ctx->block); Temp ma, tc, sc, id; - aco_opcode madak = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32; - aco_opcode madmk = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; + aco_opcode madak = + ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32; + aco_opcode madmk = + ctx->program->chip_class >= GFX10_3 ? 
aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; if (is_array) { coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]); - // see comment in ac_prepare_cube_coords() + /* see comment in ac_prepare_cube_coords() */ if (ctx->options->chip_class <= GFX8) coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]); } ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); - aco_ptr vop3a{create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; + aco_ptr vop3a{ + create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; vop3a->operands[0] = Operand(ma); vop3a->abs[0] = true; Temp invma = bld.tmp(v1); @@ -8782,11 +9067,11 @@ void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) - sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/)); + sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u /*1.5*/)); tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) - tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/)); + tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u /*1.5*/)); id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); @@ -8795,69 +9080,70 @@ void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); for (unsigned i = 0; i < 2; i++) { - // see comment in ac_prepare_cube_coords() + /* see comment in ac_prepare_cube_coords() */ Temp deriv_ma; Temp deriv_sc, deriv_tc; - build_cube_select(ctx, ma, id, i ? *ddy : *ddx, - &deriv_ma, &deriv_sc, &deriv_tc); + build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc); deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), - bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); *(i ? 
ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); } - sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc); - tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc); + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u /*1.5*/), sc); + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u /*1.5*/), tc); } if (is_array) - id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/)); + id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u /*8.0*/)); coords.resize(3); coords[0] = sc; coords[1] = tc; coords[2] = id; } -void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4]) +void +get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4]) { if (vec->parent_instr->type != nir_instr_type_alu) return; - nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr); + nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr); if (vec_instr->op != nir_op_vec(vec->num_components)) return; for (unsigned i = 0; i < vec->num_components; i++) { - cv[i] = vec_instr->src[i].swizzle[0] == 0 ? - nir_src_as_const_value(vec_instr->src[i].src) : NULL; + cv[i] = + vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL; } } -void visit_tex(isel_context *ctx, nir_tex_instr *instr) +void +visit_tex(isel_context* ctx, nir_tex_instr* instr) { Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, - has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false, - has_clamped_lod = false; + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, + has_sample_index = false, has_clamped_lod = false; Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), - clamped_lod = Temp(); + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), + clamped_lod = Temp(); std::vector coords; std::vector derivs; - nir_const_value *sample_index_cv = NULL; - nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL}; + nir_const_value* sample_index_cv = NULL; + nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL}; enum glsl_base_type stype; tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype); bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); - bool tg4_integer_cube_workaround = tg4_integer_workarounds && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + bool tg4_integer_cube_workaround = + tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; for (unsigned i = 0; i < instr->num_srcs; i++) { switch (instr->src[i].src_type) { @@ -8910,8 +9196,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; case nir_tex_src_texture_offset: case nir_tex_src_sampler_offset: - default: - break; + default: break; } } @@ -8940,10 +9225,12 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) continue; acc = emit_extract_vector(ctx, offset, i, s1); - acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu)); + acc = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu)); if (i) { - acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i)); + acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), 
bld.def(s1, scc), acc, + Operand(8u * i)); } if (pack == Temp()) { @@ -8954,7 +9241,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (pack_const && pack != Temp()) - pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack); + pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), + Operand(pack_const), pack); } else { for (unsigned i = 0; i < offset.size(); i++) { if (const_offset[i]) @@ -8986,7 +9274,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components) - prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod); + prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, + instr->is_array && instr->op != nir_texop_lod); /* pack derivatives */ if (has_ddx || has_ddy) { @@ -9003,32 +9292,26 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) has_derivs = true; } - if (instr->coord_components > 1 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array && - instr->op != nir_texop_txf) + if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && instr->op != nir_texop_txf) coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]); if (instr->coord_components > 2 && - (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || - instr->sampler_dim == GLSL_SAMPLER_DIM_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->is_array && - instr->op != nir_texop_txf && - instr->op != nir_texop_txf_ms && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]); - if (ctx->options->chip_class == GFX9 && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->op != nir_texop_lod && instr->coord_components) { assert(coords.size() > 0 && coords.size() < 3); - coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ? - Operand((uint32_t) 0) : - Operand((uint32_t) 0x3f000000))); + coords.insert( + std::next(coords.begin()), + bld.copy(bld.def(v1), instr->op == nir_texop_txf ? 
Operand((uint32_t)0) + : Operand((uint32_t)0x3f000000))); } bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); @@ -9038,9 +9321,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->op != nir_texop_txs && - instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { + instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { assert(has_sample_index); Operand op(sample_index); if (sample_index_cv) @@ -9062,9 +9344,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) dmask = u_bit_consecutive(0, util_last_bit(dmask)); if (instr->is_sparse) dmask = MAX2(dmask, 1) | 0x10; - unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF - ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) - : 0; + unsigned dim = + ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF + ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) + : 0; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp tmp_dst = dst; @@ -9079,7 +9362,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4); } else if (instr->op == nir_texop_samples_identical) { tmp_dst = bld.tmp(v1); - } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { + } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || + dst.type() == RegType::sgpr) { tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask))); } @@ -9087,20 +9371,15 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (!has_lod) lod = bld.copy(bld.def(v1), Operand(0u)); - bool div_by_6 = instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && - instr->is_array && - (dmask & (1 << 2)); + bool div_by_6 = instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array && (dmask & (1 << 2)); if (tmp_dst.id() == dst.id() && div_by_6) tmp_dst = bld.tmp(tmp_dst.regClass()); - MIMG_instruction *tex = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(tmp_dst), resource, Operand(s4), - std::vector{lod}); - if (ctx->options->chip_class == GFX9 && - instr->op == nir_texop_txs && - instr->sampler_dim == GLSL_SAMPLER_DIM_1D && - instr->is_array) { + MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst), + resource, Operand(s4), std::vector{lod}); + if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) { tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); } else if (instr->op == nir_texop_query_levels) { tex->dmask = 1 << 3; @@ -9113,15 +9392,14 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (div_by_6) { /* divide 3rd value by 6 by multiplying with magic number */ emit_split_vector(ctx, tmp_dst, tmp_dst.size()); - Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); - Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t)0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), + emit_extract_vector(ctx, tmp_dst, 2, v1), c); assert(instr->dest.ssa.num_components == 3); Temp tmp = dst.type() == 
RegType::vgpr ? dst : bld.tmp(v3); tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), emit_extract_vector(ctx, tmp_dst, 0, v1), - emit_extract_vector(ctx, tmp_dst, 1, v1), - by_6); - + emit_extract_vector(ctx, tmp_dst, 1, v1), by_6); } expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); @@ -9133,9 +9411,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (tg4_integer_workarounds) { Temp tg4_lod = bld.copy(bld.def(v1), Operand(0u)); Temp size = bld.tmp(v2); - MIMG_instruction *tex = emit_mimg(bld, aco_opcode::image_get_resinfo, - Definition(size), resource, Operand(s4), - std::vector{tg4_lod}); + MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size), + resource, Operand(s4), std::vector{tg4_lod}); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -9146,7 +9423,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) half_texel[i] = emit_extract_vector(ctx, size, i, v1); half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); - half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]); + half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000 /*-0.5*/), + half_texel[i]); } if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) { @@ -9158,25 +9436,24 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) * radv_init_sampler(). */ unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1; - Temp not_needed = bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand(bit_idx)); + Temp not_needed = + bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand(bit_idx)); not_needed = bool_to_vector_condition(ctx, not_needed); half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - Operand(0xbf000000/*-0.5*/), half_texel[0], not_needed); + Operand(0xbf000000 /*-0.5*/), half_texel[0], not_needed); half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - Operand(0xbf000000/*-0.5*/), half_texel[1], not_needed); + Operand(0xbf000000 /*-0.5*/), half_texel[1], not_needed); } - Temp new_coords[2] = { - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1]) - }; + Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])}; if (tg4_integer_cube_workaround) { - // see comment in ac_nir_to_llvm.c's lower_gather4_integer() - Temp *const desc = (Temp *)alloca(resource.size() * sizeof(Temp)); - aco_ptr split{create_instruction(aco_opcode::p_split_vector, - Format::PSEUDO, 1, resource.size())}; + /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */ + Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp)); + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())}; split->operands[0] = Operand(resource); for (unsigned i = 0; i < resource.size(); i++) { desc[i] = bld.tmp(s1); @@ -9184,21 +9461,22 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } ctx->block->instructions.emplace_back(std::move(split)); - Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16))); + Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], + Operand(20u | (6u << 
16))); Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8)); Temp nfmt; if (stype == GLSL_TYPE_UINT) { - nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), - bld.scc(compare_cube_wa)); + nfmt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa)); } else { - nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), - Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), - bld.scc(compare_cube_wa)); + nfmt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); } tg4_compare_cube_wa64 = bld.tmp(bld.lm); bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); @@ -9209,46 +9487,42 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Operand((uint32_t)C_008F14_NUM_FORMAT)); desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, - Format::PSEUDO, resource.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)}; for (unsigned i = 0; i < resource.size(); i++) vec->operands[i] = Operand(desc[i]); resource = bld.tmp(resource.regClass()); vec->definitions[0] = Definition(resource); ctx->block->instructions.emplace_back(std::move(vec)); - new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[0], coords[0], tg4_compare_cube_wa64); - new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[1], coords[1], tg4_compare_cube_wa64); + new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0], + tg4_compare_cube_wa64); + new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1], + tg4_compare_cube_wa64); } coords[0] = new_coords[0]; coords[1] = new_coords[1]; } if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { - //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe() + // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return + // ac_build_buffer_load_format_gfx9_safe() assert(coords.size() == 1); aco_opcode op; switch (util_last_bit(dmask & 0xf)) { - case 1: - op = aco_opcode::buffer_load_format_x; break; - case 2: - op = aco_opcode::buffer_load_format_xy; break; - case 3: - op = aco_opcode::buffer_load_format_xyz; break; - case 4: - op = aco_opcode::buffer_load_format_xyzw; break; - default: - unreachable("Tex instruction loads more than 4 components."); + case 1: op = aco_opcode::buffer_load_format_x; break; + case 2: op = aco_opcode::buffer_load_format_xy; break; + case 3: op = aco_opcode::buffer_load_format_xyz; break; + case 4: op = aco_opcode::buffer_load_format_xyzw; break; + default: unreachable("Tex instruction loads more than 4 components."); } - aco_ptr mubuf{create_instruction( - op, Format::MUBUF, 3 + instr->is_sparse, 1)}; + aco_ptr mubuf{ + create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)}; mubuf->operands[0] = Operand(resource); mubuf->operands[1] = Operand(coords[0]); - mubuf->operands[2] = Operand((uint32_t) 0); + mubuf->operands[2] = Operand((uint32_t)0); 
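The image_get_resinfo path a few lines up divides a cube array's reported depth by 6 by multiplying with 0x2AAAAAAB and keeping the high 32 bits (v_mul_hi_i32). A minimal host-side C++ sketch of that trick, with a hypothetical helper name and test loop, only to make the constant's role explicit:

#include <cassert>
#include <cstdint>

/* Host-side model of the multiply-high trick: 0x2AAAAAAB is ceil(2^32 / 6), so the
 * upper 32 bits of x * 0x2AAAAAAB equal x / 6 for the non-negative values used here. */
static uint32_t div_by_6(uint32_t x)
{
   return (uint32_t)(((int64_t)x * 0x2AAAAAABll) >> 32);
}

int main()
{
   for (uint32_t layers = 0; layers < 1000; layers++)
      assert(div_by_6(layers * 6) == layers); /* cube arrays report 6 * layer_count as depth */
   return 0;
}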
mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; mubuf->tfe = instr->is_sparse; @@ -9284,16 +9558,16 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (has_clamped_lod) args.emplace_back(clamped_lod); - - if (instr->op == nir_texop_txf || - instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical || - instr->op == nir_texop_fragment_fetch || + if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch || instr->op == nir_texop_fragment_mask_fetch) { - aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; + aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS + ? aco_opcode::image_load + : aco_opcode::image_load_mip; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction *tex = emit_mimg(bld, op, Definition(tmp_dst), resource, - Operand(s4), args, 0, vdata); + MIMG_instruction* tex = + emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->unrm = true; @@ -9304,7 +9578,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) assert(dmask == 1 && dst.regClass() == bld.lm); assert(dst.id() != tmp_dst.id()); - bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand(0u), tmp_dst).def(0).setHint(vcc); + bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand(0u), tmp_dst) + .def(0) + .setHint(vcc); } else { expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); } @@ -9421,14 +9697,13 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) opcode = aco_opcode::image_get_lod; } - bool implicit_derivs = bld.program->stage == fragment_fs && - !has_derivs && !has_lod && !level_zero && - instr->sampler_dim != GLSL_SAMPLER_DIM_MS && + bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod && + !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS && instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction *tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, - Operand(sampler), args, implicit_derivs ? wqm_mask : 0, vdata); + MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler), + args, implicit_derivs ? wqm_mask : 0, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->da = da; @@ -9447,30 +9722,30 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); else cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); - val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64); + val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, + tg4_compare_cube_wa64); } Temp tmp = dst.regClass() == tmp_dst.regClass() ? 
dst : bld.tmp(tmp_dst.regClass()); if (instr->is_sparse) - tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), - val[0], val[1], val[2], val[3], - emit_extract_vector(ctx, tmp_dst, 4, v1)); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], + val[3], emit_extract_vector(ctx, tmp_dst, 4, v1)); else - tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), - val[0], val[1], val[2], val[3]); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], + val[3]); } unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask; expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); - } - -Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical) +Operand +get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical) { Temp tmp = get_ssa_temp(ctx, ssa); if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { return Operand(rc); - } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) { + } else if (logical && ssa->bit_size == 1 && + ssa->parent_instr->type == nir_instr_type_load_const) { if (ctx->program->wave_size == 64) return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u); else @@ -9480,7 +9755,8 @@ Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool l } } -void visit_phi(isel_context *ctx, nir_phi_instr *instr) +void +visit_phi(isel_context* ctx, nir_phi_instr* instr) { aco_ptr phi; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -9492,17 +9768,19 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) /* we want a sorted list of sources, since the predecessor list is also sorted */ std::map phi_src; - nir_foreach_phi_src(src, instr) + nir_foreach_phi_src (src, instr) phi_src[src->pred->index] = src->src.ssa; std::vector& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; unsigned num_operands = 0; - Operand *const operands = (Operand *)alloca((std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); + Operand* const operands = (Operand*)alloca( + (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); unsigned num_defined = 0; unsigned cur_pred_idx = 0; - for (std::pair src : phi_src) { + for (std::pair src : phi_src) { if (cur_pred_idx < preds.size()) { - /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */ + /* handle missing preds (IF merges with discard/break) and extra preds + * (loop exit with discard) */ unsigned block = ctx->cf_info.nir_to_aco[src.first]; unsigned skipped = 0; while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block) @@ -9533,18 +9811,19 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) * this operand later in visit_loop() if it's not necessary or replace the * undef with something correct. 
*/ if (!logical && ctx->block->kind & block_kind_loop_header) { - nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); - nir_block *last = nir_loop_last_block(loop); + nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); + nir_block* last = nir_loop_last_block(loop); if (last->successors[0] != instr->instr.block) operands[num_operands++] = Operand(RegClass()); } /* we can use a linear phi in some cases if one src is undef */ if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) { - phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1)); + phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, + num_operands, 1)); - Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; - Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]]; + Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; + Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]]; assert(invert->kind & block_kind_invert); unsigned then_block = invert->linear_preds[0]; @@ -9572,8 +9851,8 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); } - -void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) +void +visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->def); @@ -9582,7 +9861,8 @@ void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) if (dst.size() == 1) { Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u)); } else { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; for (unsigned i = 0; i < dst.size(); i++) vec->operands[i] = Operand(0u); vec->definitions[0] = Definition(dst); @@ -9590,9 +9870,10 @@ void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) } } -void begin_loop(isel_context *ctx, loop_context *lc) +void +begin_loop(isel_context* ctx, loop_context* lc) { - //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true + // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true append_logical_end(ctx->block); ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; Builder bld(ctx->program, ctx->block); @@ -9603,7 +9884,7 @@ void begin_loop(isel_context *ctx, loop_context *lc) ctx->program->next_loop_depth++; - Block *loop_header = ctx->program->create_and_insert_block(); + Block* loop_header = ctx->program->create_and_insert_block(); loop_header->kind |= block_kind_loop_header; add_edge(loop_preheader_idx, loop_header); ctx->block = loop_header; @@ -9617,15 +9898,18 @@ void begin_loop(isel_context *ctx, loop_context *lc) lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false); } -void end_loop(isel_context *ctx, loop_context *lc) +void +end_loop(isel_context* ctx, loop_context* lc) { - //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken? + // TODO: what if a loop ends with a unconditional or uniformly branched continue + // and this branch is never taken? 
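The phi lowering above pairs a source list keyed by NIR predecessor index with the block's sorted predecessor list and fills undef operands for predecessors that have no source (edges added by discards or breaks). A simplified, self-contained C++ sketch of that alignment; the struct and function names are hypothetical and plain ints stand in for SSA values:

#include <cstdio>
#include <map>
#include <vector>

struct PhiOperand {
   bool undef;
   int value; /* stand-in for an SSA value id */
};

/* Walk the sorted predecessor list; any predecessor without a matching source gets an
 * undef operand, so the phi's operand count always equals the predecessor count. */
static std::vector<PhiOperand>
align_phi_operands(const std::vector<unsigned>& preds, const std::map<unsigned, int>& sources)
{
   std::vector<PhiOperand> ops;
   for (unsigned pred : preds) {
      auto it = sources.find(pred);
      ops.push_back(it == sources.end() ? PhiOperand{true, 0} : PhiOperand{false, it->second});
   }
   return ops;
}

int main()
{
   std::vector<unsigned> preds = {2, 5, 7};              /* sorted predecessor block indices */
   std::map<unsigned, int> sources = {{2, 10}, {7, 11}}; /* no source for block 5 */
   for (const PhiOperand& op : align_phi_operands(preds, sources))
      printf(op.undef ? "undef " : "%%%d ", op.value);
   printf("\n");
   return 0;
}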
if (!ctx->cf_info.has_branch) { unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; Builder bld(ctx->program, ctx->block); append_logical_end(ctx->block); - if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) { + if (ctx->cf_info.exec_potentially_empty_discard || + ctx->cf_info.exec_potentially_empty_break) { /* Discards can result in code running with an empty exec mask. * This would result in divergent breaks not ever being taken. As a * workaround, break the loop when the loop mask is empty instead of @@ -9634,14 +9918,14 @@ void end_loop(isel_context *ctx, loop_context *lc) unsigned block_idx = ctx->block->index; /* create helper blocks to avoid critical edges */ - Block *break_block = ctx->program->create_and_insert_block(); + Block* break_block = ctx->program->create_and_insert_block(); break_block->kind = block_kind_uniform; bld.reset(break_block); bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); add_linear_edge(block_idx, break_block); add_linear_edge(break_block->index, &lc->loop_exit); - Block *continue_block = ctx->program->create_and_insert_block(); + Block* continue_block = ctx->program->create_and_insert_block(); continue_block->kind = block_kind_uniform; bld.reset(continue_block); bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); @@ -9671,7 +9955,7 @@ void end_loop(isel_context *ctx, loop_context *lc) ctx->block = ctx->program->insert_block(std::move(lc->loop_exit)); append_logical_start(ctx->block); - #if 0 +#if 0 // TODO: check if it is beneficial to not branch on continues /* trim linear phis in loop header */ for (auto&& instr : loop_entry->instructions) { @@ -9690,7 +9974,7 @@ void end_loop(isel_context *ctx, loop_context *lc) break; } } - #endif +#endif ctx->cf_info.parent_loop.header_idx = lc->header_idx_old; ctx->cf_info.parent_loop.exit = lc->exit_old; @@ -9701,10 +9985,11 @@ void end_loop(isel_context *ctx, loop_context *lc) ctx->cf_info.exec_potentially_empty_discard = false; } -void emit_loop_jump(isel_context *ctx, bool is_break) +void +emit_loop_jump(isel_context* ctx, bool is_break) { Builder bld(ctx->program, ctx->block); - Block *logical_target; + Block* logical_target; append_logical_end(ctx->block); unsigned idx = ctx->block->index; @@ -9766,64 +10051,45 @@ void emit_loop_jump(isel_context *ctx, bool is_break) ctx->block = continue_block; } -void emit_loop_break(isel_context *ctx) +void +emit_loop_break(isel_context* ctx) { emit_loop_jump(ctx, true); } -void emit_loop_continue(isel_context *ctx) +void +emit_loop_continue(isel_context* ctx) { emit_loop_jump(ctx, false); } -void visit_jump(isel_context *ctx, nir_jump_instr *instr) +void +visit_jump(isel_context* ctx, nir_jump_instr* instr) { /* visit_block() would usually do this but divergent jumps updates ctx->block */ ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index; switch (instr->type) { - case nir_jump_break: - emit_loop_break(ctx); - break; - case nir_jump_continue: - emit_loop_continue(ctx); - break; - default: - isel_err(&instr->instr, "Unknown NIR jump instr"); - abort(); + case nir_jump_break: emit_loop_break(ctx); break; + case nir_jump_continue: emit_loop_continue(ctx); break; + default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort(); } } -void visit_block(isel_context *ctx, nir_block *block) +void +visit_block(isel_context* ctx, nir_block* block) { - nir_foreach_instr(instr, block) { + nir_foreach_instr (instr, block) { switch (instr->type) { - case nir_instr_type_alu: - 
visit_alu_instr(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_load_const: - visit_load_const(ctx, nir_instr_as_load_const(instr)); - break; - case nir_instr_type_intrinsic: - visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_tex: - visit_tex(ctx, nir_instr_as_tex(instr)); - break; - case nir_instr_type_phi: - visit_phi(ctx, nir_instr_as_phi(instr)); - break; - case nir_instr_type_ssa_undef: - visit_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - case nir_instr_type_deref: - break; - case nir_instr_type_jump: - visit_jump(ctx, nir_instr_as_jump(instr)); - break; - default: - isel_err(instr, "Unknown NIR instr type"); - //abort(); + case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break; + case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break; + case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; + case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break; + case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break; + case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break; + case nir_instr_type_deref: break; + case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; + default: isel_err(instr, "Unknown NIR instr type"); } } @@ -9831,10 +10097,9 @@ void visit_block(isel_context *ctx, nir_block *block) ctx->cf_info.nir_to_aco[block->index] = ctx->block->index; } - - -static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last, - aco_ptr& header_phi, Operand *vals) +static Operand +create_continue_phis(isel_context* ctx, unsigned first, unsigned last, + aco_ptr& header_phi, Operand* vals) { vals[0] = Operand(header_phi->definitions[0].getTemp()); RegClass rc = vals[0].regClass(); @@ -9878,11 +10143,12 @@ static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned return vals[last - first]; } -static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond); -static void begin_uniform_if_else(isel_context *ctx, if_context *ic); -static void end_uniform_if(isel_context *ctx, if_context *ic); +static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond); +static void begin_uniform_if_else(isel_context* ctx, if_context* ic); +static void end_uniform_if(isel_context* ctx, if_context* ic); -static void visit_loop(isel_context *ctx, nir_loop *loop) +static void +visit_loop(isel_context* ctx, nir_loop* loop) { loop_context lc; begin_loop(ctx, &lc); @@ -9927,13 +10193,14 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) * merge block would get CSE'd */ if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) { unsigned num_vals = ctx->cf_info.has_branch ? 
1 : (ctx->block->index - loop_header_idx + 1); - Operand *const vals = (Operand *)alloca(num_vals * sizeof(Operand)); + Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand)); for (aco_ptr& instr : ctx->program->blocks[loop_header_idx].instructions) { if (instr->opcode == aco_opcode::p_linear_phi) { if (ctx->cf_info.has_branch) instr->operands.pop_back(); else - instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); + instr->operands.back() = + create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); } else if (!is_phi(instr)) { break; } @@ -9943,7 +10210,8 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) end_loop(ctx, &lc); } -static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond) +static void +begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) { ic->cond = cond; @@ -9953,7 +10221,8 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond /* branch to linear then block */ assert(cond.regClass() == ctx->program->lane_mask); aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 1)); + branch.reset(create_instruction(aco_opcode::p_cbranch_z, + Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); branch->operands[0] = Operand(cond); @@ -9978,7 +10247,6 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond ctx->cf_info.exec_potentially_empty_break = false; ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; - /** emit logical then block */ ctx->program->next_divergent_if_logical_depth++; Block* BB_then_logical = ctx->program->create_and_insert_block(); @@ -9987,13 +10255,15 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond append_logical_start(BB_then_logical); } -static void begin_divergent_if_else(isel_context *ctx, if_context *ic) +static void +begin_divergent_if_else(isel_context* ctx, if_context* ic) { - Block *BB_then_logical = ctx->block; + Block* BB_then_logical = ctx->block; append_logical_end(BB_then_logical); - /* branch from logical then block to invert block */ + /* branch from logical then block to invert block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then_logical->instructions.emplace_back(std::move(branch)); @@ -10011,33 +10281,33 @@ static void begin_divergent_if_else(isel_context *ctx, if_context *ic) BB_then_linear->kind |= block_kind_uniform; add_linear_edge(ic->BB_if_idx, BB_then_linear); /* branch from linear then block to invert block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then_linear->instructions.emplace_back(std::move(branch)); add_linear_edge(BB_then_linear->index, &ic->BB_invert); - /** emit invert merge block */ ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); ic->invert_idx = ctx->block->index; /* branch to linear else block (skip else) */ - 
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); ctx->block->instructions.push_back(std::move(branch)); ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; - ic->exec_potentially_empty_break_depth_old = - std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + ic->exec_potentially_empty_break_depth_old = std::min( + ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); /* divergent branches use cbranch_execz */ ctx->cf_info.exec_potentially_empty_discard = false; ctx->cf_info.exec_potentially_empty_break = false; ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; - /** emit logical else block */ ctx->program->next_divergent_if_logical_depth++; Block* BB_else_logical = ctx->program->create_and_insert_block(); @@ -10047,14 +10317,16 @@ static void begin_divergent_if_else(isel_context *ctx, if_context *ic) append_logical_start(BB_else_logical); } -static void end_divergent_if(isel_context *ctx, if_context *ic) +static void +end_divergent_if(isel_context* ctx, if_context* ic) { - Block *BB_else_logical = ctx->block; + Block* BB_else_logical = ctx->block; append_logical_end(BB_else_logical); /* branch from logical else block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else_logical->instructions.emplace_back(std::move(branch)); @@ -10067,30 +10339,28 @@ static void end_divergent_if(isel_context *ctx, if_context *ic) assert(!ctx->cf_info.has_branch); ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; - /** emit linear else block */ Block* BB_else_linear = ctx->program->create_and_insert_block(); BB_else_linear->kind |= block_kind_uniform; add_linear_edge(ic->invert_idx, BB_else_linear); /* branch from linear else block to endif block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else_linear->instructions.emplace_back(std::move(branch)); add_linear_edge(BB_else_linear->index, &ic->BB_endif); - /** emit endif merge block */ ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); append_logical_start(ctx->block); - ctx->cf_info.parent_if.is_divergent = ic->divergent_old; ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; - ctx->cf_info.exec_potentially_empty_break_depth = - std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + ctx->cf_info.exec_potentially_empty_break_depth = std::min( + ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); if (ctx->block->loop_nest_depth == 
ctx->cf_info.exec_potentially_empty_break_depth && !ctx->cf_info.parent_if.is_divergent) { ctx->cf_info.exec_potentially_empty_break = false; @@ -10104,7 +10374,8 @@ static void end_divergent_if(isel_context *ctx, if_context *ic) } } -static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) +static void +begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) { assert(cond.regClass() == s1); @@ -10113,7 +10384,8 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) aco_ptr branch; aco_opcode branch_opcode = aco_opcode::p_cbranch_z; - branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); + branch.reset( + create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); branch->operands[0] = Operand(cond); @@ -10127,7 +10399,6 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) ctx->cf_info.has_branch = false; ctx->cf_info.parent_loop.has_divergent_branch = false; - /** emit then block */ ctx->program->next_uniform_if_depth++; Block* BB_then = ctx->program->create_and_insert_block(); @@ -10136,9 +10407,10 @@ static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) ctx->block = BB_then; } -static void begin_uniform_if_else(isel_context *ctx, if_context *ic) +static void +begin_uniform_if_else(isel_context* ctx, if_context* ic) { - Block *BB_then = ctx->block; + Block* BB_then = ctx->block; ic->uniform_has_then_branch = ctx->cf_info.has_branch; ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; @@ -10147,7 +10419,8 @@ static void begin_uniform_if_else(isel_context *ctx, if_context *ic) append_logical_end(BB_then); /* branch from then block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_then->instructions.emplace_back(std::move(branch)); @@ -10167,15 +10440,17 @@ static void begin_uniform_if_else(isel_context *ctx, if_context *ic) ctx->block = BB_else; } -static void end_uniform_if(isel_context *ctx, if_context *ic) +static void +end_uniform_if(isel_context* ctx, if_context* ic) { - Block *BB_else = ctx->block; + Block* BB_else = ctx->block; if (!ctx->cf_info.has_branch) { append_logical_end(BB_else); /* branch from then block to endif block */ aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); + branch.reset(create_instruction(aco_opcode::p_branch, + Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); branch->definitions[0].setHint(vcc); BB_else->instructions.emplace_back(std::move(branch)); @@ -10188,7 +10463,6 @@ static void end_uniform_if(isel_context *ctx, if_context *ic) ctx->cf_info.has_branch &= ic->uniform_has_then_branch; ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; - /** emit endif merge block */ ctx->program->next_uniform_if_depth--; if (!ctx->cf_info.has_branch) { @@ -10197,7 +10471,8 @@ static void end_uniform_if(isel_context *ctx, if_context *ic) } } -static bool visit_if(isel_context *ctx, nir_if *if_stmt) +static bool +visit_if(isel_context* ctx, nir_if* if_stmt) { Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); 
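The divergent-if helpers above build a then/invert/else/endif block structure so each side of the branch runs under a restricted exec mask, with the invert block handing the remaining lanes to the else side. A conceptual C++ model of that lane bookkeeping, with made-up mask values; it is a sketch of the idea, not the emitted ACO code:

#include <cstdint>
#include <cstdio>

int main()
{
   uint64_t exec = 0xffffull; /* lanes active when the divergent if is reached */
   uint64_t cond = 0x0f0full; /* per-lane condition (lane mask) */

   uint64_t exec_then = exec & cond;  /* then side runs with only the true lanes */
   /* ... logical + linear then blocks execute here ... */

   uint64_t exec_else = exec & ~cond; /* invert block: remaining lanes take the else side */
   /* ... logical + linear else blocks execute here ... */

   uint64_t exec_endif = exec_then | exec_else; /* endif restores the original mask */
   printf("then=%#llx else=%#llx endif=%#llx\n", (unsigned long long)exec_then,
          (unsigned long long)exec_else, (unsigned long long)exec_endif);
   return 0;
}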
Builder bld(ctx->program, ctx->block); @@ -10269,41 +10544,38 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); } -static bool visit_cf_list(isel_context *ctx, - struct exec_list *list) +static bool +visit_cf_list(isel_context* ctx, struct exec_list* list) { - foreach_list_typed(nir_cf_node, node, node, list) { + foreach_list_typed (nir_cf_node, node, node, list) { switch (node->type) { - case nir_cf_node_block: - visit_block(ctx, nir_cf_node_as_block(node)); - break; + case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break; case nir_cf_node_if: if (!visit_if(ctx, nir_cf_node_as_if(node))) return true; break; - case nir_cf_node_loop: - visit_loop(ctx, nir_cf_node_as_loop(node)); - break; - default: - unreachable("unimplemented cf list type"); + case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break; + default: unreachable("unimplemented cf list type"); } } return false; } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +static void +export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos) { assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) - ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot] - : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; + ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot] + : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; unsigned mask = ctx->outputs.mask[slot]; if (!is_pos && !mask) return; if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) return; - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + aco_ptr exp{ + create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = mask; for (unsigned i = 0; i < 4; ++i) { if (mask & (1 << i)) @@ -10324,9 +10596,11 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex ctx->block->instructions.emplace_back(std::move(exp)); } -static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) +static void +export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos) { - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; + aco_ptr exp{ + create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = 0; for (unsigned i = 0; i < 4; ++i) exp->operands[i] = Operand(v1); @@ -10374,11 +10648,10 @@ static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) Temp rates = bld.copy(bld.def(v1), Operand((unsigned)ctx->options->force_vrs_rates)); /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. 
*/ - Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), - Operand(0x3f800000u), + Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand(0x3f800000u), Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3])); - rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - bld.copy(bld.def(v1), Operand(0u)), rates, cond); + rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), + rates, cond); exp->operands[1] = Operand(rates); exp->enabled_mask |= 0x2; @@ -10391,27 +10664,31 @@ static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos) ctx->block->instructions.emplace_back(std::move(exp)); } -static void create_vs_exports(isel_context *ctx) +static void +create_vs_exports(isel_context* ctx) { assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); - radv_vs_output_info *outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) - ? &ctx->program->info->tes.outinfo - : &ctx->program->info->vs.outinfo; + radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) + ? &ctx->program->info->tes.outinfo + : &ctx->program->info->vs.outinfo; ctx->block->kind |= block_kind_export_end; if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) { ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; if (ctx->stage.has(SWStage::TES)) - ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.tes_patch_id); + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = + get_arg(ctx, ctx->args->ac.tes_patch_id); else - ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.vs_prim_id); + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = + get_arg(ctx, ctx->args->ac.vs_prim_id); } if (ctx->options->key.has_multiview_view_index) { ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1; - ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); + ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = + as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); } /* Hardware requires position data to always be exported, even if the @@ -10423,8 +10700,8 @@ static void create_vs_exports(isel_context *ctx) int next_pos = 0; export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); - bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || - ctx->options->force_vrs_rates; + bool writes_primitive_shading_rate = + outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index || writes_primitive_shading_rate) { export_vs_psiz_layer_viewport_vrs(ctx, &next_pos); @@ -10442,9 +10719,7 @@ static void create_vs_exports(isel_context *ctx) } for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { - if (i < VARYING_SLOT_VAR0 && - i != VARYING_SLOT_LAYER && - i != VARYING_SLOT_PRIMITIVE_ID && + if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID && i != VARYING_SLOT_VIEWPORT) continue; @@ -10452,7 +10727,8 @@ static void create_vs_exports(isel_context *ctx) } } -static bool export_fs_mrt_z(isel_context *ctx) +static bool +export_fs_mrt_z(isel_context* ctx) { Builder bld(ctx->program, ctx->block); unsigned enabled_channels = 0; @@ -10465,8 +10741,7 @@ static bool export_fs_mrt_z(isel_context *ctx) /* Both stencil and sample mask only need 16-bits. 
*/ if (!ctx->program->info->ps.writes_z && - (ctx->program->info->ps.writes_stencil || - ctx->program->info->ps.writes_sample_mask)) { + (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) { compr = true; /* COMPR flag */ if (ctx->program->info->ps.writes_stencil) { @@ -10480,7 +10755,7 @@ static bool export_fs_mrt_z(isel_context *ctx) /* SampleMask should be in Y[15:0]. */ values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); enabled_channels |= 0xc; - } + } } else { if (ctx->program->info->ps.writes_z) { values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]); @@ -10501,19 +10776,19 @@ static bool export_fs_mrt_z(isel_context *ctx) /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X * writemask component. */ - if (ctx->options->chip_class == GFX6 && - ctx->options->family != CHIP_OLAND && + if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND && ctx->options->family != CHIP_HAINAN) { - enabled_channels |= 0x1; + enabled_channels |= 0x1; } - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], - enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr); + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, + V_008DFC_SQ_EXP_MRTZ, compr); return true; } -static bool export_fs_mrt_color(isel_context *ctx, int slot) +static bool +export_fs_mrt_color(isel_context* ctx, int slot) { Builder bld(ctx->program, ctx->block); unsigned write_mask = ctx->outputs.mask[slot]; @@ -10541,34 +10816,25 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_16bit = values[0].regClass() == v2b; /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ - if (ctx->options->enable_mrt_output_nan_fixup && - !is_16bit && - (col_format == V_028714_SPI_SHADER_32_R || - col_format == V_028714_SPI_SHADER_32_GR || - col_format == V_028714_SPI_SHADER_32_AR || - col_format == V_028714_SPI_SHADER_32_ABGR || + if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || col_format == V_028714_SPI_SHADER_FP16_ABGR)) { for (int i = 0; i < 4; i++) { if (!(write_mask & (1 << i))) continue; - Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, - bld.hint_vcc(bld.def(bld.lm)), values[i], - bld.copy(bld.def(v1), Operand(3u))); + Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), + values[i], bld.copy(bld.def(v1), Operand(3u))); values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], bld.copy(bld.def(v1), Operand(0u)), isnan); } } - switch (col_format) - { - case V_028714_SPI_SHADER_32_R: - enabled_channels = 1; - break; + switch (col_format) { + case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; - case V_028714_SPI_SHADER_32_GR: - enabled_channels = 0x3; - break; + case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; case V_028714_SPI_SHADER_32_AR: if (ctx->options->chip_class >= GFX10) { @@ -10583,21 +10849,24 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) case V_028714_SPI_SHADER_FP16_ABGR: for (int i = 0; i < 2; i++) { - bool enabled = (write_mask >> (i*2)) & 0x3; + bool enabled = (write_mask >> (i * 2)) & 0x3; if (enabled) { - enabled_channels |= 0x3 << (i*2); + enabled_channels |= 0x3 << (i * 2); if (is_16bit) { - values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), - 
values[i*2].isUndefined() ? Operand(v2b) : values[i*2], - values[i*2+1].isUndefined() ? Operand(v2b): values[i*2+1]); - } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9 ) { - values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), - values[i*2].isUndefined() ? Operand(0u) : values[i*2], - values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + values[i] = + bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), + values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]); + } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) { + values[i] = + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), + values[i * 2].isUndefined() ? Operand(0u) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(0u) : values[i * 2 + 1]); } else { - values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), - values[i*2].isUndefined() ? values[i*2+1] : values[i*2], - values[i*2+1].isUndefined() ? values[i*2] : values[i*2+1]); + values[i] = + bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), + values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2], + values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]); } } else { values[i] = Operand(v1); @@ -10633,9 +10902,9 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) for (unsigned i = 0; i < 4; i++) { if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), - i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), - values[i]); + values[i] = + bld.vop2(aco_opcode::v_min_u32, bld.def(v1), + i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), values[i]); } } } else if (is_16bit) { @@ -10654,18 +10923,18 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) if (is_int8 || is_int10) { /* clamp */ uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; - uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0; + uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0; Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb)); for (unsigned i = 0; i < 4; i++) { if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), - values[i]); - values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), - values[i]); + values[i] = + bld.vop2(aco_opcode::v_min_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), values[i]); + values[i] = + bld.vop2(aco_opcode::v_max_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), values[i]); } } } else if (is_16bit) { @@ -10678,24 +10947,21 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) } break; - case V_028714_SPI_SHADER_32_ABGR: - enabled_channels = 0xF; - break; + case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; case V_028714_SPI_SHADER_ZERO: - default: - return false; + default: return false; } - if ((bool) compr_op) { + if ((bool)compr_op) { for (int i = 0; i < 2; i++) { /* check if at least one of the values to be compressed is enabled */ - bool enabled = (write_mask >> (i*2)) & 0x3; + bool enabled = (write_mask >> (i * 2)) & 0x3; if (enabled) { - enabled_channels |= 0x3 << (i*2); + enabled_channels |= 0x3 << (i * 2); values[i] = bld.vop3(compr_op, bld.def(v1), - values[i*2].isUndefined() ? 
Operand(0u) : values[i*2], - values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + values[i * 2].isUndefined() ? Operand(0u) : values[i * 2], + values[i * 2 + 1].isUndefined() ? Operand(0u) : values[i * 2 + 1]); } else { values[i] = Operand(v1); } @@ -10708,12 +10974,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); } - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], - enabled_channels, target, compr); + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target, + compr); return true; } -static void create_fs_null_export(isel_context *ctx) +static void +create_fs_null_export(isel_context* ctx) { /* FS must always have exports. * So when there are none, we need to add a null export. @@ -10725,13 +10992,13 @@ static void create_fs_null_export(isel_context *ctx) /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); } -static void create_fs_exports(isel_context *ctx) +static void +create_fs_exports(isel_context* ctx) { bool exported = false; /* Export depth, stencil and sample mask. */ - if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || - ctx->outputs.mask[FRAG_RESULT_STENCIL] || + if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] || ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) exported |= export_fs_mrt_z(ctx); @@ -10746,17 +11013,16 @@ static void create_fs_exports(isel_context *ctx) ctx->block->kind |= block_kind_export_end; } -static void create_workgroup_barrier(Builder& bld) +static void +create_workgroup_barrier(Builder& bld) { bld.barrier(aco_opcode::p_barrier, - memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), - scope_workgroup); + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup); } -static void emit_stream_output(isel_context *ctx, - Temp const *so_buffers, - Temp const *so_write_offset, - const struct radv_stream_output *output) +static void +emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset, + const struct radv_stream_output* output) { unsigned num_comps = util_bitcount(output->component_mask); unsigned writemask = (1 << num_comps) - 1; @@ -10791,39 +11057,34 @@ static void emit_stream_output(isel_context *ctx, unsigned offset = output->offset + start * 4; Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count)); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (int i = 0; i < count; ++i) - vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u); + vec->operands[i] = + (ctx->outputs.mask[loc] & 1 << (start + i)) ? 
Operand(out[start + i]) : Operand(0u); vec->definitions[0] = Definition(write_data); ctx->block->instructions.emplace_back(std::move(vec)); aco_opcode opcode; switch (count) { - case 1: - opcode = aco_opcode::buffer_store_dword; - break; - case 2: - opcode = aco_opcode::buffer_store_dwordx2; - break; - case 3: - opcode = aco_opcode::buffer_store_dwordx3; - break; - case 4: - opcode = aco_opcode::buffer_store_dwordx4; - break; - default: - unreachable("Unsupported dword count."); + case 1: opcode = aco_opcode::buffer_store_dword; break; + case 2: opcode = aco_opcode::buffer_store_dwordx2; break; + case 3: opcode = aco_opcode::buffer_store_dwordx3; break; + case 4: opcode = aco_opcode::buffer_store_dwordx4; break; + default: unreachable("Unsupported dword count."); } - aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + aco_ptr store{ + create_instruction(opcode, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(so_buffers[buf]); store->operands[1] = Operand(so_write_offset[buf]); - store->operands[2] = Operand((uint32_t) 0); + store->operands[2] = Operand((uint32_t)0); store->operands[3] = Operand(write_data); if (offset > 4095) { /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */ Builder bld(ctx->program, ctx->block); - store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); + store->operands[0] = + bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); } else { store->offset = offset; } @@ -10835,7 +11096,8 @@ static void emit_stream_output(isel_context *ctx, } } -static void emit_streamout(isel_context *ctx, unsigned stream) +static void +emit_streamout(isel_context* ctx, unsigned stream) { Builder bld(ctx->program, ctx->block); @@ -10862,7 +11124,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) bld.reset(ctx->block); - Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); + Temp so_write_index = + bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); Temp so_write_offset[4]; @@ -10877,7 +11140,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) get_arg(ctx, ctx->args->ac.streamout_offset[i])); Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); - so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); + so_write_offset[i] = + bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); } else { Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), @@ -10887,8 +11151,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) } for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { - struct radv_stream_output *output = - &ctx->program->info->so.outputs[i]; + struct radv_stream_output* output = &ctx->program->info->so.outputs[i]; if (stream != output->stream) continue; @@ -10899,7 +11162,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) end_divergent_if(ctx, &ic); } -Pseudo_instruction *add_startpgm(struct isel_context *ctx) +Pseudo_instruction* +add_startpgm(struct isel_context* ctx) { unsigned arg_count = ctx->args->ac.arg_count; if (ctx->stage == fragment_fs) { @@ -10911,7 +11175,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) * could run before argument setup, then this wouldn't be necessary * anymore. 
*/ - struct ac_shader_args *args = &ctx->args->ac; + struct ac_shader_args* args = &ctx->args->ac; arg_count = 0; for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) { if (args->args[i].file != AC_ARG_VGPR) { @@ -10930,7 +11194,8 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) } } - aco_ptr startpgm{create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)}; + aco_ptr startpgm{ + create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)}; for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { if (ctx->args->ac.args[i].skip) continue; @@ -10945,7 +11210,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); arg++; } - Pseudo_instruction *instr = startpgm.get(); + Pseudo_instruction* instr = startpgm.get(); ctx->block->instructions.push_back(std::move(startpgm)); /* Stash these in the program so that they can be accessed later when @@ -10957,37 +11222,36 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) return instr; } -void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm) +void +fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm) { assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); Builder bld(ctx->program, ctx->block); constexpr unsigned hs_idx = 1u; - Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), - Operand((8u << 16) | (hs_idx * 8u))); + Builder::Result hs_thread_count = + bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.merged_wave_info), Operand((8u << 16) | (hs_idx * 8u))); Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. 
*/ - Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.vertex_id), - get_arg(ctx, ctx->args->ac.instance_id), - ls_has_nonzero_hs_threads); - Temp vs_rel_patch_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_rel_ids), - get_arg(ctx, ctx->args->ac.vs_rel_patch_id), - ls_has_nonzero_hs_threads); - Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - get_arg(ctx, ctx->args->ac.tcs_patch_id), - get_arg(ctx, ctx->args->ac.vertex_id), - ls_has_nonzero_hs_threads); + Temp instance_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id), + get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads); + Temp vs_rel_patch_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), + get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads); + Temp vertex_id = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id), + get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads); ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id; ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id; ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id; } -void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) +void +split_arguments(isel_context* ctx, Pseudo_instruction* startpgm) { /* Split all arguments except for the first (ring_offsets) and the last * (exec) so that the dead channels don't stay live throughout the program. @@ -11000,13 +11264,16 @@ void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) } } -void handle_bc_optimize(isel_context *ctx) +void +handle_bc_optimize(isel_context* ctx) { /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ Builder bld(ctx->program, ctx->block); uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; - bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); - bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + bool uses_center = + G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); + bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); if (uses_center && uses_centroid) { @@ -11016,10 +11283,12 @@ void handle_bc_optimize(isel_context *ctx) if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); - Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); - new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - persp_centroid, persp_center, sel); + Temp persp_centroid = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); + Temp persp_center = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); + new_coord[i] = + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel); } ctx->persp_centroid = bld.tmp(v2); bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), @@ -11030,10 +11299,12 @@ void 
handle_bc_optimize(isel_context *ctx) if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { - Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); - Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); - new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - linear_centroid, linear_center, sel); + Temp linear_centroid = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); + Temp linear_center = + emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); + new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid, + linear_center, sel); } ctx->linear_centroid = bld.tmp(v2); bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid), @@ -11043,9 +11314,10 @@ void handle_bc_optimize(isel_context *ctx) } } -void setup_fp_mode(isel_context *ctx, nir_shader *shader) +void +setup_fp_mode(isel_context* ctx, nir_shader* shader) { - Program *program = ctx->program; + Program* program = ctx->program; unsigned float_controls = shader->info.float_controls_execution_mode; @@ -11058,15 +11330,17 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) program->next_fp_mode.must_flush_denorms32 = float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; program->next_fp_mode.must_flush_denorms16_64 = - float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | - FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); + float_controls & + (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); program->next_fp_mode.care_about_round32 = - float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); + float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); program->next_fp_mode.care_about_round16_64 = - float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | - FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); + float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and * the precision seems needed for Wolfenstein: Youngblood to render correctly */ @@ -11086,7 +11360,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) else program->next_fp_mode.round32 = fp_round_ne; - if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + if (float_controls & + (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) program->next_fp_mode.round16_64 = fp_round_tz; else program->next_fp_mode.round16_64 = fp_round_ne; @@ -11094,7 +11369,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) ctx->block->fp_mode = program->next_fp_mode; } -void cleanup_cfg(Program *program) +void +cleanup_cfg(Program* program) { /* create linear_succs/logical_succs */ for (Block& BB : program->blocks) { @@ -11105,7 +11381,8 @@ void cleanup_cfg(Program *program) } } -Temp lanecount_to_mask(isel_context *ctx, Temp count, bool allow64 = true) +Temp +lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true) { assert(count.regClass() == s1); @@ -11119,30 +11396,33 @@ Temp lanecount_to_mask(isel_context *ctx, Temp count, bool allow64 = true) 
return mask; /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */ - Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */)); + Temp active_64 = + bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */)); cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64)); } else { - /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */ + /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of + * the register */ cond = emit_extract_vector(ctx, mask, 0, bld.lm); } return cond; } -Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i) +Temp +merged_wave_info_to_mask(isel_context* ctx, unsigned i) { Builder bld(ctx->program, ctx->block); /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */ - Temp count = i == 0 - ? get_arg(ctx, ctx->args->ac.merged_wave_info) - : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(i * 8u)); + Temp count = i == 0 ? get_arg(ctx, ctx->args->ac.merged_wave_info) + : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(i * 8u)); return lanecount_to_mask(ctx, count); } -void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt) +void +ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt) { assert(vtx_cnt.id() && prm_cnt.id()); @@ -11152,25 +11432,31 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) { /* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */ prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u)); - prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0)); - vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), vtx_cnt, bld.scc(prm_cnt_0)); + prm_cnt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0)); + vtx_cnt = + bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), vtx_cnt, bld.scc(prm_cnt_0)); } /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */ - Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u)); + Temp tmp = + bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u)); tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt); - /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */ + /* Request the SPI to allocate space for the primitives and vertices + * that will be exported by the threadgroup. */ bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req); if (prm_cnt_0.id()) { /* Navi 1x workaround: export a triangle with NaN coordinates when GS has no output. - * It can't have all-zero positions because that would render an undesired pixel with conservative rasterization. + * It can't have all-zero positions because that would render an undesired pixel with + * conservative rasterization. 
*/ Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand(1u, ctx->program->wave_size == 64), first_lane); - cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, Operand(0u, ctx->program->wave_size == 64), bld.scc(prm_cnt_0)); + cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, + Operand(0u, ctx->program->wave_size == 64), bld.scc(prm_cnt_0)); if_context ic_prim_0; begin_divergent_if_then(ctx, &ic_prim_0, cond); @@ -11182,12 +11468,12 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt /* Use NaN for the coordinates, so that the rasterizer allways culls it. */ Temp nan_coord = bld.copy(bld.def(v1), Operand(-1u)); - bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), - 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, - false /* compressed */, true /* done */, false /* valid mask */); - bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, - 0xf /* enabled mask */, V_008DFC_SQ_EXP_POS /* dest */, - false /* compressed */, true /* done */, true /* valid mask */); + bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */, + V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */, + false /* valid mask */); + bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */, + V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */, + true /* valid mask */); begin_divergent_if_else(ctx, &ic_prim_0); end_divergent_if(ctx, &ic_prim_0); @@ -11197,25 +11483,23 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt } /* end namespace */ -void select_program(Program *program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args) { isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); if_context ic_merged_wave_info; bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); for (unsigned i = 0; i < shader_count; i++) { - nir_shader *nir = shaders[i]; + nir_shader* nir = shaders[i]; init_context(&ctx, nir); setup_fp_mode(&ctx, nir); if (!i) { /* needs to be after init_context() for FS */ - Pseudo_instruction *startpgm = add_startpgm(&ctx); + Pseudo_instruction* startpgm = add_startpgm(&ctx); append_logical_start(ctx.block); if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) @@ -11229,20 +11513,22 @@ void select_program(Program *program, } /* In a merged VS+TCS HS, the VS implementation can be completely empty. 
*/ - nir_function_impl *func = nir_shader_get_entrypoint(nir); - bool empty_shader = nir_cf_list_is_empty_block(&func->body) && - ((nir->info.stage == MESA_SHADER_VERTEX && - (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || - (nir->info.stage == MESA_SHADER_TESS_EVAL && - ctx.stage == tess_eval_geometry_gs)); + nir_function_impl* func = nir_shader_get_entrypoint(nir); + bool empty_shader = + nir_cf_list_is_empty_block(&func->body) && + ((nir->info.stage == MESA_SHADER_VERTEX && + (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs)); - bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); - bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); + bool check_merged_wave_info = + ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); + bool endif_merged_wave_info = + ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); - if (program->chip_class == GFX10 && - program->stage.hw == HWStage::NGG && + if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG && program->stage.num_sw_stages() == 1) { - /* Workaround for Navi 1x HW bug to ensure all NGG waves launch before s_sendmsg(GS_ALLOC_REQ). */ + /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before + * s_sendmsg(GS_ALLOC_REQ). */ Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u); } @@ -11263,7 +11549,8 @@ void select_program(Program *program, if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc), - get_arg(&ctx, args->ac.merged_wave_info), Operand(2u), Operand(8u), Operand(0u)); + get_arg(&ctx, args->ac.merged_wave_info), Operand(2u), + Operand(8u), Operand(0u)); } } else if (ctx.stage == geometry_gs) ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id); @@ -11282,7 +11569,8 @@ void select_program(Program *program, Builder bld(ctx.program, ctx.block); bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_vmem_output, semantic_release, scope_device)); - bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, + sendmsg_gs_done(false, false, 0)); } if (ctx.stage == fragment_fs) { @@ -11313,9 +11601,9 @@ void select_program(Program *program, cleanup_cfg(program); } -void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, + struct radv_shader_args* args) { isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); @@ -11326,14 +11614,16 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, Builder bld(ctx.program, ctx.block); - Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u)); + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), + program->private_segment_buffer, Operand(RING_GSVS_VS * 16u)); Operand stream_id(0u); if (args->shader_info->so.num_outputs) stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, 
ctx.args->ac.streamout_config), Operand(0x20018u)); - Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id)); + Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), + get_arg(&ctx, ctx.args->ac.vertex_id)); std::stack if_contexts; @@ -11348,7 +11638,8 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask)); if (!stream_id.isConstant()) { - Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream)); + Temp cond = + bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream)); if_contexts.emplace(); begin_uniform_if_then(&ctx, &if_contexts.top(), cond); bld.reset(ctx.block); @@ -11367,8 +11658,8 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, Temp val = bld.tmp(v1); unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4; - load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, - 0u, true, true, true); + load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true, + true, true); ctx.outputs.mask[i] |= 1 << j; ctx.outputs.temps[i * 4u + j] = val; @@ -11407,14 +11698,14 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, cleanup_cfg(program); } -void select_trap_handler_shader(Program *program, struct nir_shader *shader, - ac_shader_config* config, - struct radv_shader_args *args) +void +select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, + struct radv_shader_args* args) { assert(args->options->chip_class == GFX8); - init_program(program, compute_cs, args->shader_info, - args->options->chip_class, args->options->family, args->options->wgp_mode, config); + init_program(program, compute_cs, args->shader_info, args->options->chip_class, + args->options->family, args->options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; @@ -11433,12 +11724,12 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, Builder bld(ctx.program, ctx.block); /* Load the buffer descriptor from TMA. */ - bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), - Operand(PhysReg{tma}, s2), Operand(0u)); + bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2), + Operand(0u)); /* Store TTMP0-TTMP1. 
*/ - bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), - Operand(0u), Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); + bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand(0u), + Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true); uint32_t hw_regs_idx[] = { 2, /* HW_REG_STATUS */ @@ -11453,8 +11744,8 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1), ((20 - 1) << 11) | hw_regs_idx[i]); - bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), - Operand(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); + bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4), Operand(8u + i * 4), + Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true); } program->config->float_mode = program->blocks[0].fp_mode.val; @@ -11465,4 +11756,4 @@ void select_trap_handler_shader(Program *program, struct nir_shader *shader, cleanup_cfg(program); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index 50d1a76efd3..43dd76a31aa 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -39,21 +39,22 @@ struct shader_io_state { uint8_t mask[VARYING_SLOT_MAX]; Temp temps[VARYING_SLOT_MAX * 4u]; - shader_io_state() { + shader_io_state() + { memset(mask, 0, sizeof(mask)); std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1)); } }; struct isel_context { - const struct radv_nir_compiler_options *options; - struct radv_shader_args *args; - Program *program; - nir_shader *shader; + const struct radv_nir_compiler_options* options; + struct radv_shader_args* args; + Program* program; + nir_shader* shader; uint32_t constant_data_offset; - Block *block; + Block* block; uint32_t first_temp_id; - std::unordered_map> allocated_vec; + std::unordered_map> allocated_vec; Stage stage; struct { bool has_branch; @@ -66,7 +67,8 @@ struct isel_context { struct { bool is_divergent = false; } parent_if; - bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ + bool exec_potentially_empty_discard = + false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ uint16_t exec_potentially_empty_break_depth = UINT16_MAX; /* Set to false when loop_nest_depth==exec_potentially_empty_break_depth * and parent_if.is_divergent==false. Called _break but it's also used for @@ -76,7 +78,7 @@ struct isel_context { } cf_info; /* NIR range analysis. 
*/ - struct hash_table *range_ht; + struct hash_table* range_ht; nir_unsigned_upper_bound_config ub_config; Temp arg_temps[AC_MAX_ARGS]; @@ -102,22 +104,19 @@ struct isel_context { shader_io_state outputs; }; -inline Temp get_arg(isel_context *ctx, struct ac_arg arg) +inline Temp +get_arg(isel_context* ctx, struct ac_arg arg) { assert(arg.used); return ctx->arg_temps[arg.arg_index]; } -void init_context(isel_context *ctx, nir_shader *shader); -void cleanup_context(isel_context *ctx); +void init_context(isel_context* ctx, nir_shader* shader); +void cleanup_context(isel_context* ctx); -isel_context -setup_isel_context(Program* program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args, - bool is_gs_copy_shader); +isel_context setup_isel_context(Program* program, unsigned shader_count, + struct nir_shader* const* shaders, ac_shader_config* config, + struct radv_shader_args* args, bool is_gs_copy_shader); } // namespace aco diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 191c8e86cce..430f9f62530 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -36,7 +36,8 @@ namespace aco { namespace { -unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) +unsigned +get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) { switch (interp) { case INTERP_MODE_SMOOTH: @@ -58,13 +59,13 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) else if (intrin == nir_intrinsic_load_barycentric_sample) return S_0286CC_LINEAR_SAMPLE_ENA(1); break; - default: - break; + default: break; } return 0; } -bool is_loop_header_block(nir_block *block) +bool +is_loop_header_block(nir_block* block) { return block->cf_node.parent->type == nir_cf_node_loop && block == nir_loop_first_block(nir_cf_node_as_loop(block->cf_node.parent)); @@ -72,20 +73,20 @@ bool is_loop_header_block(nir_block *block) /* similar to nir_block_is_unreachable(), but does not require dominance information */ bool -is_block_reachable(nir_function_impl *impl, nir_block *known_reachable, nir_block *block) +is_block_reachable(nir_function_impl* impl, nir_block* known_reachable, nir_block* block) { if (block == nir_start_block(impl) || block == known_reachable) return true; /* skip loop back-edges */ if (is_loop_header_block(block)) { - nir_loop *loop = nir_cf_node_as_loop(block->cf_node.parent); - nir_block *preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop)); + nir_loop* loop = nir_cf_node_as_loop(block->cf_node.parent); + nir_block* preheader = nir_block_cf_tree_prev(nir_loop_first_block(loop)); return is_block_reachable(impl, known_reachable, preheader); } - set_foreach(block->predecessors, entry) { - if (is_block_reachable(impl, known_reachable, (nir_block *)entry->key)) + set_foreach (block->predecessors, entry) { + if (is_block_reachable(impl, known_reachable, (nir_block*)entry->key)) return true; } @@ -94,12 +95,12 @@ is_block_reachable(nir_function_impl *impl, nir_block *known_reachable, nir_bloc /* Check whether the given SSA def is only used by cross-lane instructions. 
*/ bool -only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) +only_used_by_cross_lane_instrs(nir_ssa_def* ssa, bool follow_phis = true) { nir_foreach_use (src, ssa) { switch (src->parent_instr->type) { case nir_instr_type_alu: { - nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); + nir_alu_instr* alu = nir_instr_as_alu(src->parent_instr); if (alu->op != nir_op_unpack_64_2x32_split_x && alu->op != nir_op_unpack_64_2x32_split_y) return false; if (!only_used_by_cross_lane_instrs(&alu->dest.dest.ssa, follow_phis)) @@ -108,7 +109,7 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) continue; } case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr); + nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(src->parent_instr); if (intrin->intrinsic != nir_intrinsic_read_invocation && intrin->intrinsic != nir_intrinsic_read_first_invocation && intrin->intrinsic != nir_intrinsic_lane_permute_16_amd) @@ -121,14 +122,13 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) if (!follow_phis) return false; - nir_phi_instr *phi = nir_instr_as_phi(src->parent_instr); + nir_phi_instr* phi = nir_instr_as_phi(src->parent_instr); if (!only_used_by_cross_lane_instrs(&phi->dest.ssa, false)) return false; continue; } - default: - return false; + default: return false; } } @@ -140,12 +140,12 @@ only_used_by_cross_lane_instrs(nir_ssa_def *ssa, bool follow_phis = true) * block instead. This is so that we can use any SGPR live-out of the side * without the branch without creating a linear phi in the invert or merge block. */ bool -sanitize_if(nir_function_impl *impl, nir_if *nif) +sanitize_if(nir_function_impl* impl, nir_if* nif) { - //TODO: skip this if the condition is uniform and there are no divergent breaks/continues? + // TODO: skip this if the condition is uniform and there are no divergent breaks/continues? - nir_block *then_block = nir_if_last_then_block(nif); - nir_block *else_block = nir_if_last_else_block(nif); + nir_block* then_block = nir_if_last_then_block(nif); + nir_block* else_block = nir_if_last_else_block(nif); bool then_jump = nir_block_ends_in_jump(then_block) || !is_block_reachable(impl, nir_if_first_then_block(nif), then_block); bool else_jump = nir_block_ends_in_jump(else_block) || @@ -167,47 +167,46 @@ sanitize_if(nir_function_impl *impl, nir_if *nif) nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))); /* Finally, move the continue from branch after the if-statement. */ - nir_block *last_continue_from_blk = else_jump ? then_block : else_block; - nir_block *first_continue_from_blk = else_jump ? - nir_if_first_then_block(nif) : nir_if_first_else_block(nif); + nir_block* last_continue_from_blk = else_jump ? then_block : else_block; + nir_block* first_continue_from_blk = + else_jump ? 
nir_if_first_then_block(nif) : nir_if_first_else_block(nif); nir_cf_list tmp; nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk), - nir_after_block(last_continue_from_blk)); + nir_after_block(last_continue_from_blk)); nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node)); return true; } bool -sanitize_cf_list(nir_function_impl *impl, struct exec_list *cf_list) +sanitize_cf_list(nir_function_impl* impl, struct exec_list* cf_list) { bool progress = false; - foreach_list_typed(nir_cf_node, cf_node, node, cf_list) { + foreach_list_typed (nir_cf_node, cf_node, node, cf_list) { switch (cf_node->type) { - case nir_cf_node_block: - break; + case nir_cf_node_block: break; case nir_cf_node_if: { - nir_if *nif = nir_cf_node_as_if(cf_node); + nir_if* nif = nir_cf_node_as_if(cf_node); progress |= sanitize_cf_list(impl, &nif->then_list); progress |= sanitize_cf_list(impl, &nif->else_list); progress |= sanitize_if(impl, nif); break; } case nir_cf_node_loop: { - nir_loop *loop = nir_cf_node_as_loop(cf_node); + nir_loop* loop = nir_cf_node_as_loop(cf_node); progress |= sanitize_cf_list(impl, &loop->body); break; } - case nir_cf_node_function: - unreachable("Invalid cf type"); + case nir_cf_node_function: unreachable("Invalid cf type"); } } return progress; } -void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) +void +apply_nuw_to_ssa(isel_context* ctx, nir_ssa_def* ssa) { nir_ssa_scalar scalar; scalar.def = ssa; @@ -216,7 +215,7 @@ void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != nir_op_iadd) return; - nir_alu_instr *add = nir_instr_as_alu(ssa->parent_instr); + nir_alu_instr* add = nir_instr_as_alu(ssa->parent_instr); if (add->no_unsigned_wrap) return; @@ -230,20 +229,19 @@ void apply_nuw_to_ssa(isel_context *ctx, nir_ssa_def *ssa) src1 = tmp; } - uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, - src1, &ctx->ub_config); + uint32_t src1_ub = nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, src1, &ctx->ub_config); add->no_unsigned_wrap = - !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, - &ctx->ub_config); + !nir_addition_might_overflow(ctx->shader, ctx->range_ht, src0, src1_ub, &ctx->ub_config); } -void apply_nuw_to_offsets(isel_context *ctx, nir_function_impl *impl) +void +apply_nuw_to_offsets(isel_context* ctx, nir_function_impl* impl) { - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_load_constant: @@ -261,14 +259,14 @@ void apply_nuw_to_offsets(isel_context *ctx, nir_function_impl *impl) if (!nir_src_is_divergent(intrin->src[2])) apply_nuw_to_ssa(ctx, intrin->src[2].ssa); break; - default: - break; + default: break; } } } } -RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, unsigned bitsize) +RegClass +get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bitsize) { if (bitsize == 1) return RegClass(RegType::sgpr, ctx->program->lane_mask.size() * components); @@ -277,17 +275,16 @@ RegClass get_reg_class(isel_context *ctx, RegType type, unsigned components, uns } void -setup_vs_output_info(isel_context *ctx, nir_shader *nir, - bool export_prim_id, bool export_clip_dists, - 
radv_vs_output_info *outinfo) +setup_vs_output_info(isel_context* ctx, nir_shader* nir, bool export_prim_id, + bool export_clip_dists, radv_vs_output_info* outinfo) { memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset)); outinfo->param_exports = 0; int pos_written = 0x1; - bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || - ctx->options->force_vrs_rates; + bool writes_primitive_shading_rate = + outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer || writes_primitive_shading_rate) pos_written |= 1 << 1; @@ -297,7 +294,8 @@ setup_vs_output_info(isel_context *ctx, nir_shader *nir, int idx = u_bit_scan64(&mask); if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT || - ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) { + ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && + export_clip_dists)) { if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED) outinfo->vs_output_param_offset[idx] = outinfo->param_exports++; } @@ -333,15 +331,14 @@ setup_vs_output_info(isel_context *ctx, nir_shader *nir, * as soon as it encounters a DONE pos export. When this happens, PS waves can launch * before the NGG (or VS) waves finish. */ - ctx->program->early_rast = ctx->program->chip_class >= GFX10 && - outinfo->param_exports == 0; + ctx->program->early_rast = ctx->program->chip_class >= GFX10 && outinfo->param_exports == 0; } void -setup_vs_variables(isel_context *ctx, nir_shader *nir) +setup_vs_variables(isel_context* ctx, nir_shader* nir) { if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; setup_vs_output_info(ctx, nir, outinfo->export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, outinfo); @@ -351,21 +348,26 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) } if (ctx->stage == vertex_ngg) { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024)); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < + (32 * 1024)); } } -void setup_gs_variables(isel_context *ctx, nir_shader *nir) +void +setup_gs_variables(isel_context* ctx, nir_shader* nir) { if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { - ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ + ctx->program->config->lds_size = + ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; - setup_vs_output_info(ctx, nir, false, - ctx->options->key.vs_common_out.export_clip_dists, outinfo); + radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; + setup_vs_output_info(ctx, nir, false, ctx->options->key.vs_common_out.export_clip_dists, + outinfo); - ctx->program->config->lds_size = 
DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); } if (ctx->stage.has(SWStage::VS)) @@ -375,7 +377,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) } void -setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) +setup_tcs_info(isel_context* ctx, nir_shader* nir, nir_shader* vs) { ctx->tcs_in_out_eq = ctx->args->shader_info->vs.tcs_in_out_eq; ctx->tcs_temp_only_inputs = ctx->args->shader_info->vs.tcs_temp_only_input_mask; @@ -384,12 +386,12 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) } void -setup_tes_variables(isel_context *ctx, nir_shader *nir) +setup_tes_variables(isel_context* ctx, nir_shader* nir) { ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) { - radv_vs_output_info *outinfo = &ctx->program->info->tes.outinfo; + radv_vs_output_info* outinfo = &ctx->program->info->tes.outinfo; setup_vs_output_info(ctx, nir, outinfo->export_prim_id, ctx->options->key.vs_common_out.export_clip_dists, outinfo); @@ -399,20 +401,23 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir) } if (ctx->stage == tess_eval_ngg) { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024)); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < + (32 * 1024)); } } void -setup_variables(isel_context *ctx, nir_shader *nir) +setup_variables(isel_context* ctx, nir_shader* nir) { switch (nir->info.stage) { case MESA_SHADER_FRAGMENT: { break; } case MESA_SHADER_COMPUTE: { - ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); + ctx->program->config->lds_size = + DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); break; } case MESA_SHADER_VERTEX: { @@ -430,16 +435,16 @@ setup_variables(isel_context *ctx, nir_shader *nir) setup_tes_variables(ctx, nir); break; } - default: - unreachable("Unhandled shader stage."); + default: unreachable("Unhandled shader stage."); } /* Make sure we fit the available LDS space. */ - assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <= ctx->program->dev.lds_limit); + assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <= + ctx->program->dev.lds_limit); } void -setup_nir(isel_context *ctx, nir_shader *nir) +setup_nir(isel_context* ctx, nir_shader* nir) { /* the variable setup has to be done before lower_io / CSE */ setup_variables(ctx, nir); @@ -447,19 +452,20 @@ setup_nir(isel_context *ctx, nir_shader *nir) nir_convert_to_lcssa(nir, true, false); nir_lower_phis_to_scalar(nir, true); - nir_function_impl *func = nir_shader_get_entrypoint(nir); + nir_function_impl* func = nir_shader_get_entrypoint(nir); nir_index_ssa_defs(func); } } /* end namespace */ -void init_context(isel_context *ctx, nir_shader *shader) +void +init_context(isel_context* ctx, nir_shader* shader) { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_function_impl* impl = nir_shader_get_entrypoint(shader); ctx->shader = shader; /* Init NIR range analysis. 
*/ - ctx->range_ht =_mesa_pointer_hash_table_create(NULL); + ctx->range_ht = _mesa_pointer_hash_table_create(NULL); ctx->ub_config.min_subgroup_size = 64; ctx->ub_config.max_subgroup_size = 64; if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) { @@ -481,34 +487,23 @@ void init_context(isel_context *ctx, nir_shader *shader) uint32_t max = UINT32_MAX; if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) { max = 0x3f800000u; - } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || - nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) { + } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED) { bool uscaled = nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED; switch (dfmt) { case V_008F0C_BUF_DATA_FORMAT_8: case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - max = uscaled ? 0x437f0000u : UINT8_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: max = uscaled ? 0x437f0000u : UINT8_MAX; break; case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - max = uscaled ? 0x447fc000u : 1023; - break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: max = uscaled ? 0x447fc000u : 1023; break; case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - max = uscaled ? 0x44ffe000u : 2047; - break; + case V_008F0C_BUF_DATA_FORMAT_11_11_10: max = uscaled ? 0x44ffe000u : 2047; break; case V_008F0C_BUF_DATA_FORMAT_16: case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - max = uscaled ? 0x477fff00u : UINT16_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: max = uscaled ? 0x477fff00u : UINT16_MAX; break; case V_008F0C_BUF_DATA_FORMAT_32: case V_008F0C_BUF_DATA_FORMAT_32_32: case V_008F0C_BUF_DATA_FORMAT_32_32_32: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - max = uscaled ? 0x4f800000u : UINT32_MAX; - break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: max = uscaled ? 0x4f800000u : UINT32_MAX; break; } } ctx->ub_config.vertex_attrib_max[i] = max; @@ -533,7 +528,7 @@ void init_context(isel_context *ctx, nir_shader *shader) ctx->first_temp_id = ctx->program->peekAllocationId(); ctx->program->allocateRange(impl->ssa_alloc); - RegClass *regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; + RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; unsigned spi_ps_inputs = 0; @@ -543,100 +538,99 @@ void init_context(isel_context *ctx, nir_shader *shader) bool done = false; while (!done) { done = true; - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - switch(instr->type) { + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + switch (instr->type) { case nir_instr_type_alu: { - nir_alu_instr *alu_instr = nir_instr_as_alu(instr); - RegType type = nir_dest_is_divergent(alu_instr->dest.dest) ? 
RegType::vgpr : RegType::sgpr; - switch(alu_instr->op) { - case nir_op_fmul: - case nir_op_fadd: - case nir_op_fsub: - case nir_op_fmax: - case nir_op_fmin: - case nir_op_fneg: - case nir_op_fabs: - case nir_op_fsat: - case nir_op_fsign: - case nir_op_frcp: - case nir_op_frsq: - case nir_op_fsqrt: - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_ffract: - case nir_op_ffloor: - case nir_op_fceil: - case nir_op_ftrunc: - case nir_op_fround_even: - case nir_op_fsin: - case nir_op_fcos: - case nir_op_f2f16: - case nir_op_f2f16_rtz: - case nir_op_f2f16_rtne: - case nir_op_f2f32: - case nir_op_f2f64: - case nir_op_u2f16: - case nir_op_u2f32: - case nir_op_u2f64: - case nir_op_i2f16: - case nir_op_i2f32: - case nir_op_i2f64: - case nir_op_pack_half_2x16_split: - case nir_op_unpack_half_2x16_split_x: - case nir_op_unpack_half_2x16_split_y: - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - case nir_op_fquantize2f16: - case nir_op_ldexp: - case nir_op_frexp_sig: - case nir_op_frexp_exp: - case nir_op_cube_face_index_amd: - case nir_op_cube_face_coord_amd: - case nir_op_sad_u8x4: - type = RegType::vgpr; - break; - case nir_op_f2i16: - case nir_op_f2u16: - case nir_op_f2i32: - case nir_op_f2u32: - case nir_op_f2i64: - case nir_op_f2u64: - case nir_op_b2i8: - case nir_op_b2i16: - case nir_op_b2i32: - case nir_op_b2i64: - case nir_op_b2b32: - case nir_op_b2f16: - case nir_op_b2f32: - case nir_op_mov: - break; - case nir_op_iadd: - case nir_op_isub: - case nir_op_imul: - case nir_op_imin: - case nir_op_imax: - case nir_op_umin: - case nir_op_umax: - case nir_op_ishl: - case nir_op_ishr: - case nir_op_ushr: - /* packed 16bit instructions have to be VGPR */ - type = alu_instr->dest.dest.ssa.num_components == 2 ? RegType::vgpr : type; - FALLTHROUGH; - default: - for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { - if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) - type = RegType::vgpr; - } - break; + nir_alu_instr* alu_instr = nir_instr_as_alu(instr); + RegType type = + nir_dest_is_divergent(alu_instr->dest.dest) ? 
RegType::vgpr : RegType::sgpr; + switch (alu_instr->op) { + case nir_op_fmul: + case nir_op_fadd: + case nir_op_fsub: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fneg: + case nir_op_fabs: + case nir_op_fsat: + case nir_op_fsign: + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_ffract: + case nir_op_ffloor: + case nir_op_fceil: + case nir_op_ftrunc: + case nir_op_fround_even: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_f2f16: + case nir_op_f2f16_rtz: + case nir_op_f2f16_rtne: + case nir_op_f2f32: + case nir_op_f2f64: + case nir_op_u2f16: + case nir_op_u2f32: + case nir_op_u2f64: + case nir_op_i2f16: + case nir_op_i2f32: + case nir_op_i2f64: + case nir_op_pack_half_2x16_split: + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + case nir_op_fquantize2f16: + case nir_op_ldexp: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + case nir_op_cube_face_index_amd: + case nir_op_cube_face_coord_amd: + case nir_op_sad_u8x4: type = RegType::vgpr; break; + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: + case nir_op_b2b32: + case nir_op_b2f16: + case nir_op_b2f32: + case nir_op_mov: break; + case nir_op_iadd: + case nir_op_isub: + case nir_op_imul: + case nir_op_imin: + case nir_op_imax: + case nir_op_umin: + case nir_op_umax: + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + /* packed 16bit instructions have to be VGPR */ + type = alu_instr->dest.dest.ssa.num_components == 2 ? 
RegType::vgpr : type; + FALLTHROUGH; + default: + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + break; } - RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components, alu_instr->dest.dest.ssa.bit_size); + RegClass rc = get_reg_class(ctx, type, alu_instr->dest.dest.ssa.num_components, + alu_instr->dest.dest.ssa.bit_size); regclasses[alu_instr->dest.dest.ssa.index] = rc; break; } @@ -648,207 +642,203 @@ void init_context(isel_context *ctx, nir_shader *shader) break; } case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) break; RegType type = RegType::sgpr; - switch(intrinsic->intrinsic) { - case nir_intrinsic_load_push_constant: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_num_workgroups: - case nir_intrinsic_load_subgroup_id: - case nir_intrinsic_load_num_subgroups: - case nir_intrinsic_load_first_vertex: - case nir_intrinsic_load_base_instance: - case nir_intrinsic_vote_all: - case nir_intrinsic_vote_any: - case nir_intrinsic_read_first_invocation: - case nir_intrinsic_read_invocation: - case nir_intrinsic_first_invocation: - case nir_intrinsic_ballot: - case nir_intrinsic_load_ring_tess_factors_amd: - case nir_intrinsic_load_ring_tess_factors_offset_amd: - case nir_intrinsic_load_ring_tess_offchip_amd: - case nir_intrinsic_load_ring_tess_offchip_offset_amd: - case nir_intrinsic_load_ring_esgs_amd: - case nir_intrinsic_load_ring_es2gs_offset_amd: - case nir_intrinsic_image_deref_samples: - case nir_intrinsic_has_input_vertex_amd: - case nir_intrinsic_has_input_primitive_amd: - case nir_intrinsic_load_workgroup_num_input_vertices_amd: - case nir_intrinsic_load_workgroup_num_input_primitives_amd: - case nir_intrinsic_load_shader_query_enabled_amd: - type = RegType::sgpr; - break; - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_sample_mask_in: - case nir_intrinsic_load_input: - case nir_intrinsic_load_output: - case nir_intrinsic_load_input_vertex: - case nir_intrinsic_load_per_vertex_input: - case nir_intrinsic_load_per_vertex_output: - case nir_intrinsic_load_vertex_id: - case nir_intrinsic_load_vertex_id_zero_base: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_model: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: - case nir_intrinsic_load_interpolated_input: - case nir_intrinsic_load_frag_coord: - case nir_intrinsic_load_frag_shading_rate: - case nir_intrinsic_load_sample_pos: - case nir_intrinsic_load_layer_id: - case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_local_invocation_index: - case nir_intrinsic_load_subgroup_invocation: - case nir_intrinsic_load_tess_coord: - case nir_intrinsic_write_invocation_amd: - case nir_intrinsic_mbcnt_amd: - case nir_intrinsic_byte_permute_amd: - case nir_intrinsic_lane_permute_16_amd: - case nir_intrinsic_load_instance_id: - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case 
nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_global_atomic_add: - case nir_intrinsic_global_atomic_imin: - case nir_intrinsic_global_atomic_umin: - case nir_intrinsic_global_atomic_imax: - case nir_intrinsic_global_atomic_umax: - case nir_intrinsic_global_atomic_and: - case nir_intrinsic_global_atomic_or: - case nir_intrinsic_global_atomic_xor: - case nir_intrinsic_global_atomic_exchange: - case nir_intrinsic_global_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_size: - case nir_intrinsic_shared_atomic_add: - case nir_intrinsic_shared_atomic_imin: - case nir_intrinsic_shared_atomic_umin: - case nir_intrinsic_shared_atomic_imax: - case nir_intrinsic_shared_atomic_umax: - case nir_intrinsic_shared_atomic_and: - case nir_intrinsic_shared_atomic_or: - case nir_intrinsic_shared_atomic_xor: - case nir_intrinsic_shared_atomic_exchange: - case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: - case nir_intrinsic_load_scratch: - case nir_intrinsic_load_invocation_id: - case nir_intrinsic_load_primitive_id: - case nir_intrinsic_load_buffer_amd: - case nir_intrinsic_load_tess_rel_patch_id_amd: - case nir_intrinsic_load_gs_vertex_offset_amd: - case nir_intrinsic_load_initial_edgeflag_amd: - case nir_intrinsic_load_packed_passthrough_primitive_amd: - case nir_intrinsic_gds_atomic_add_amd: - case nir_intrinsic_load_sbt_amd: - case nir_intrinsic_bvh64_intersect_ray_amd: + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_read_invocation: + case nir_intrinsic_first_invocation: + case nir_intrinsic_ballot: + case nir_intrinsic_load_ring_tess_factors_amd: + case nir_intrinsic_load_ring_tess_factors_offset_amd: + case nir_intrinsic_load_ring_tess_offchip_amd: + case nir_intrinsic_load_ring_tess_offchip_offset_amd: + case nir_intrinsic_load_ring_esgs_amd: + case nir_intrinsic_load_ring_es2gs_offset_amd: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_has_input_vertex_amd: + case nir_intrinsic_has_input_primitive_amd: + case nir_intrinsic_load_workgroup_num_input_vertices_amd: + case nir_intrinsic_load_workgroup_num_input_primitives_amd: + case nir_intrinsic_load_shader_query_enabled_amd: type = RegType::sgpr; break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_input: + case nir_intrinsic_load_output: + case nir_intrinsic_load_input_vertex: + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_barycentric_sample: + case 
nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_model: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_frag_shading_rate: + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_layer_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_load_tess_coord: + case nir_intrinsic_write_invocation_amd: + case nir_intrinsic_mbcnt_amd: + case nir_intrinsic_byte_permute_amd: + case nir_intrinsic_lane_permute_16_amd: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_primitive_id: + case nir_intrinsic_load_buffer_amd: + case nir_intrinsic_load_tess_rel_patch_id_amd: + case nir_intrinsic_load_gs_vertex_offset_amd: + case nir_intrinsic_load_initial_edgeflag_amd: + case nir_intrinsic_load_packed_passthrough_primitive_amd: + case nir_intrinsic_gds_atomic_add_amd: + case nir_intrinsic_load_sbt_amd: + case nir_intrinsic_bvh64_intersect_ray_amd: type = RegType::vgpr; break; + case nir_intrinsic_load_shared: + /* When the result of these loads is only used by cross-lane instructions, + * it is beneficial to use a VGPR destination. This is because this allows + * to put the s_waitcnt further down, which decreases latency. 
+ */ + if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) { type = RegType::vgpr; break; - case nir_intrinsic_load_shared: - /* When the result of these loads is only used by cross-lane instructions, - * it is beneficial to use a VGPR destination. This is because this allows - * to put the s_waitcnt further down, which decreases latency. - */ - if (only_used_by_cross_lane_instrs(&intrinsic->dest.ssa)) { + } + FALLTHROUGH; + case nir_intrinsic_shuffle: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: + case nir_intrinsic_masked_swizzle_amd: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + case nir_intrinsic_reduce: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + case nir_intrinsic_vulkan_resource_index: + case nir_intrinsic_get_ssbo_size: + type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; + break; + case nir_intrinsic_load_view_index: + type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; + break; + default: + for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; + i++) { + if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) type = RegType::vgpr; - break; - } - FALLTHROUGH; - case nir_intrinsic_shuffle: - case nir_intrinsic_quad_broadcast: - case nir_intrinsic_quad_swap_horizontal: - case nir_intrinsic_quad_swap_vertical: - case nir_intrinsic_quad_swap_diagonal: - case nir_intrinsic_quad_swizzle_amd: - case nir_intrinsic_masked_swizzle_amd: - case nir_intrinsic_inclusive_scan: - case nir_intrinsic_exclusive_scan: - case nir_intrinsic_reduce: - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_global: - case nir_intrinsic_vulkan_resource_index: - case nir_intrinsic_get_ssbo_size: - type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; - break; - case nir_intrinsic_load_view_index: - type = ctx->stage == fragment_fs ? 
RegType::vgpr : RegType::sgpr; - break; - default: - for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) { - if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) - type = RegType::vgpr; - } - break; + } + break; } - RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, intrinsic->dest.ssa.bit_size); + RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, + intrinsic->dest.ssa.bit_size); regclasses[intrinsic->dest.ssa.index] = rc; - switch(intrinsic->intrinsic) { - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: { - glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); - spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); - break; + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); + spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); + break; + } + case nir_intrinsic_load_barycentric_model: + spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); + break; + case nir_intrinsic_load_front_face: + spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); + break; + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: { + uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; } - case nir_intrinsic_load_barycentric_model: - spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); - break; - case nir_intrinsic_load_front_face: - spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); - break; - case nir_intrinsic_load_frag_coord: - case nir_intrinsic_load_sample_pos: { - uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; - } - - if (ctx->options->adjust_frag_coord_z && - intrinsic->intrinsic == nir_intrinsic_load_frag_coord && - G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) { - /* Enable ancillary for adjusting gl_FragCoord.z for - * VRS due to a hw bug on some GFX10.3 chips. - */ - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - } - break; + if (ctx->options->adjust_frag_coord_z && + intrinsic->intrinsic == nir_intrinsic_load_frag_coord && + G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) { + /* Enable ancillary for adjusting gl_FragCoord.z for + * VRS due to a hw bug on some GFX10.3 chips. 
+ */ + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); } - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_frag_shading_rate: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - break; - case nir_intrinsic_load_sample_mask_in: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); - break; - default: - break; + break; + } + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_frag_shading_rate: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + break; + case nir_intrinsic_load_sample_mask_in: + spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); + spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); + break; + default: break; } break; } @@ -860,13 +850,13 @@ void init_context(isel_context *ctx, nir_shader *shader) assert(!tex->dest.ssa.divergent); } - RegClass rc = get_reg_class(ctx, type, tex->dest.ssa.num_components, - tex->dest.ssa.bit_size); + RegClass rc = + get_reg_class(ctx, type, tex->dest.ssa.num_components, tex->dest.ssa.bit_size); regclasses[tex->dest.ssa.index] = rc; break; } case nir_instr_type_parallel_copy: { - nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) { + nir_foreach_parallel_copy_entry (entry, nir_instr_as_parallel_copy(instr)) { regclasses[entry->dest.ssa.index] = regclasses[entry->src.ssa->index]; } break; @@ -900,8 +890,7 @@ void init_context(isel_context *ctx, nir_shader *shader) regclasses[phi->dest.ssa.index] = rc; break; } - default: - break; + default: break; } } } @@ -931,47 +920,33 @@ void init_context(isel_context *ctx, nir_shader *shader) (uint8_t*)shader->constant_data + shader->constant_data_size); } -void cleanup_context(isel_context *ctx) +void +cleanup_context(isel_context* ctx) { _mesa_hash_table_destroy(ctx->range_ht, NULL); } isel_context -setup_isel_context(Program* program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args, - bool is_gs_copy_shader) +setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args, bool is_gs_copy_shader) { SWStage sw_stage = SWStage::None; for (unsigned i = 0; i < shader_count; i++) { switch (shaders[i]->info.stage) { - case MESA_SHADER_VERTEX: - sw_stage = sw_stage | SWStage::VS; - break; - case MESA_SHADER_TESS_CTRL: - sw_stage = sw_stage | SWStage::TCS; - break; - case MESA_SHADER_TESS_EVAL: - sw_stage = sw_stage | SWStage::TES; - break; + case MESA_SHADER_VERTEX: sw_stage = sw_stage | SWStage::VS; break; + case MESA_SHADER_TESS_CTRL: sw_stage = sw_stage | SWStage::TCS; break; + case MESA_SHADER_TESS_EVAL: sw_stage = sw_stage | SWStage::TES; break; case MESA_SHADER_GEOMETRY: sw_stage = sw_stage | (is_gs_copy_shader ? 
SWStage::GSCopy : SWStage::GS); break; - case MESA_SHADER_FRAGMENT: - sw_stage = sw_stage | SWStage::FS; - break; - case MESA_SHADER_COMPUTE: - sw_stage = sw_stage | SWStage::CS; - break; - default: - unreachable("Shader stage not implemented"); + case MESA_SHADER_FRAGMENT: sw_stage = sw_stage | SWStage::FS; break; + case MESA_SHADER_COMPUTE: sw_stage = sw_stage | SWStage::CS; break; + default: unreachable("Shader stage not implemented"); } } bool gfx9_plus = args->options->chip_class >= GFX9; bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10; - HWStage hw_stage { }; + HWStage hw_stage{}; if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg) hw_stage = HWStage::ES; else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg) @@ -1009,8 +984,8 @@ setup_isel_context(Program* program, else unreachable("Shader stage not implemented"); - init_program(program, Stage { hw_stage, sw_stage }, args->shader_info, - args->options->chip_class, args->options->family, args->options->wgp_mode, config); + init_program(program, Stage{hw_stage, sw_stage}, args->shader_info, args->options->chip_class, + args->options->family, args->options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; @@ -1028,29 +1003,37 @@ setup_isel_context(Program* program, shaders[0]->info.workgroup_size[1] * shaders[0]->info.workgroup_size[2]; } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) { - /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */ + /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 + * (not implemented in Mesa) */ program->workgroup_size = program->wave_size; } else if (program->stage.hw == HWStage::GS) { /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */ assert(program->chip_class >= GFX9); - uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); - uint32_t gs_instr_prims_in_subgrp = G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t es_verts_per_subgrp = + G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); + uint32_t gs_instr_prims_in_subgrp = + G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp); program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1); } else if (program->stage == vertex_ls) { /* Unmerged LS operates in workgroups */ - program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */ + program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but + those are not plumbed to ACO for LS */ } else if (program->stage == tess_control_hs) { /* Unmerged HS operates in workgroups, size is determined by the output vertices */ setup_tcs_info(&ctx, shaders[0], NULL); program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out; } else if (program->stage == vertex_tess_control_hs) { - /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */ + /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS + * invocations */ setup_tcs_info(&ctx, shaders[1], shaders[0]); - program->workgroup_size = ctx.tcs_num_patches * 
MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); + program->workgroup_size = + ctx.tcs_num_patches * + MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); } else if (program->stage.hw == HWStage::NGG) { - gfx10_ngg_info &ngg_info = args->shader_info->ngg_info; - unsigned num_gs_invocations = (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1; + gfx10_ngg_info& ngg_info = args->shader_info->ngg_info; + unsigned num_gs_invocations = + (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1; /* Max ES (SW VS/TES) threads */ uint32_t max_esverts = ngg_info.hw_max_esverts; @@ -1074,7 +1057,7 @@ setup_isel_context(Program* program, setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo); } else { for (unsigned i = 0; i < shader_count; i++) { - nir_shader *nir = shaders[i]; + nir_shader* nir = shaders[i]; setup_nir(&ctx, nir); } @@ -1090,4 +1073,4 @@ setup_isel_context(Program* program, return ctx; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index e7ea91e3c84..0c06f2a3ca8 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -23,6 +23,7 @@ */ #include "aco_interface.h" + #include "aco_ir.h" #include "vulkan/radv_shader.h" @@ -37,23 +38,33 @@ static const std::array statistic_infos = []() { std::array ret{}; - ret[aco::statistic_hash] = aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"}; - ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"}; - ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; + ret[aco::statistic_hash] = + aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"}; + ret[aco::statistic_instructions] = + aco_compiler_statistic_info{"Instructions", "Instruction count"}; + ret[aco::statistic_copies] = + aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"}; - ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; - ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"}; - ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; - ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; - ret[aco::statistic_sgpr_presched] = aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"}; - ret[aco::statistic_vgpr_presched] = aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"}; + ret[aco::statistic_latency] = + aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; + ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{ + "Inverse Throughput", "Estimated busy cycles to execute one wave"}; + ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{ + "VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; + ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{ + "SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; + ret[aco::statistic_sgpr_presched] = + 
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"}; + ret[aco::statistic_vgpr_presched] = + aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"}; return ret; }(); const unsigned aco_num_statistics = aco::num_statistics; -const aco_compiler_statistic_info *aco_statistic_infos = statistic_infos.data(); +const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data(); -static void validate(aco::Program *program) +static void +validate(aco::Program* program) { if (!(aco::debug_flags & aco::DEBUG_VALIDATE_IR)) return; @@ -62,10 +73,9 @@ static void validate(aco::Program *program) assert(is_valid); } -void aco_compile_shader(unsigned shader_count, - struct nir_shader *const *shaders, - struct radv_shader_binary **binary, - struct radv_shader_args *args) +void +aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, + struct radv_shader_binary** binary, struct radv_shader_args* args) { aco::init(); @@ -116,11 +126,11 @@ void aco_compile_shader(unsigned shader_count, std::string llvm_ir; if (args->options->record_ir) { - char *data = NULL; + char* data = NULL; size_t size = 0; u_memstream mem; if (u_memstream_open(&mem, &data, &size)) { - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); aco_print_program(program.get(), memf); fputc(0, memf); u_memstream_close(&mem); @@ -137,8 +147,7 @@ void aco_compile_shader(unsigned shader_count, aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill); if (!args->is_trap_handler_shader) { - if (!args->options->disable_optimizations && - !(aco::debug_flags & aco::DEBUG_NO_SCHED)) + if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED)) aco::schedule_program(program.get(), live_vars); validate(program.get()); @@ -189,11 +198,11 @@ void aco_compile_shader(unsigned shader_count, std::string disasm; if (get_disasm) { - char *data = NULL; + char* data = NULL; size_t disasm_size = 0; struct u_memstream mem; if (u_memstream_open(&mem, &data, &disasm_size)) { - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); aco::print_asm(program.get(), code, exec_size / 4u, memf); fputc(0, memf); u_memstream_close(&mem); @@ -214,10 +223,10 @@ void aco_compile_shader(unsigned shader_count, * directly for the disk cache. Uninitialized data can appear because of * padding in the struct or because legacy_binary->data can be at an offset * from the start less than sizeof(radv_shader_binary_legacy). 
*/ - radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1); + radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*)calloc(size, 1); legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY; - legacy_binary->base.stage = shaders[shader_count-1]->info.stage; + legacy_binary->base.stage = shaders[shader_count - 1]->info.stage; legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader; legacy_binary->base.total_size = size; @@ -225,7 +234,8 @@ void aco_compile_shader(unsigned shader_count, memcpy(legacy_binary->data, program->statistics, aco::num_statistics * sizeof(uint32_t)); legacy_binary->stats_size = stats_size; - memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t)); + memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), + code.size() * sizeof(uint32_t)); legacy_binary->exec_size = exec_size; legacy_binary->code_size = code.size() * sizeof(uint32_t); @@ -233,12 +243,15 @@ void aco_compile_shader(unsigned shader_count, legacy_binary->disasm_size = 0; legacy_binary->ir_size = llvm_ir.size(); - llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size()); + llvm_ir.copy((char*)legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, + llvm_ir.size()); if (get_disasm) { - disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size()); + disasm.copy((char*)legacy_binary->data + legacy_binary->stats_size + + legacy_binary->code_size + llvm_ir.size(), + disasm.size()); legacy_binary->disasm_size = disasm.size(); } - *binary = (radv_shader_binary*) legacy_binary; + *binary = (radv_shader_binary*)legacy_binary; } diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h index a4cace90912..a0df87827ef 100644 --- a/src/amd/compiler/aco_interface.h +++ b/src/amd/compiler/aco_interface.h @@ -39,12 +39,10 @@ struct aco_compiler_statistic_info { }; extern const unsigned aco_num_statistics; -extern const struct aco_compiler_statistic_info *aco_statistic_infos; +extern const struct aco_compiler_statistic_info* aco_statistic_infos; -void aco_compile_shader(unsigned shader_count, - struct nir_shader *const *shaders, - struct radv_shader_binary** binary, - struct radv_shader_args *args); +void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, + struct radv_shader_binary** binary, struct radv_shader_args* args); #ifdef __cplusplus } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 4184aa1cd43..79f9d71a793 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -32,39 +32,40 @@ namespace aco { uint64_t debug_flags = 0; -static const struct debug_control aco_debug_options[] = { - {"validateir", DEBUG_VALIDATE_IR}, - {"validatera", DEBUG_VALIDATE_RA}, - {"perfwarn", DEBUG_PERFWARN}, - {"force-waitcnt", DEBUG_FORCE_WAITCNT}, - {"novn", DEBUG_NO_VN}, - {"noopt", DEBUG_NO_OPT}, - {"nosched", DEBUG_NO_SCHED}, - {"perfinfo", DEBUG_PERF_INFO}, - {"liveinfo", DEBUG_LIVE_INFO}, - {NULL, 0} -}; +static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR}, + {"validatera", DEBUG_VALIDATE_RA}, + {"perfwarn", DEBUG_PERFWARN}, + {"force-waitcnt", DEBUG_FORCE_WAITCNT}, + {"novn", DEBUG_NO_VN}, + {"noopt", DEBUG_NO_OPT}, + {"nosched", DEBUG_NO_SCHED}, + {"perfinfo", DEBUG_PERF_INFO}, + {"liveinfo", DEBUG_LIVE_INFO}, + {NULL, 0}}; static once_flag 
init_once_flag = ONCE_FLAG_INIT; -static void init_once() +static void +init_once() { debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); - #ifndef NDEBUG +#ifndef NDEBUG /* enable some flags by default on debug builds */ debug_flags |= aco::DEBUG_VALIDATE_IR; - #endif +#endif } -void init() +void +init() { call_once(&init_once_flag, init_once); } -void init_program(Program *program, Stage stage, struct radv_shader_info *info, - enum chip_class chip_class, enum radeon_family family, - bool wgp_mode, ac_shader_config *config) +void +init_program(Program* program, Stage stage, struct radv_shader_info* info, + enum chip_class chip_class, enum radeon_family family, bool wgp_mode, + ac_shader_config* config) { program->stage = stage; program->config = config; @@ -72,24 +73,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->chip_class = chip_class; if (family == CHIP_UNKNOWN) { switch (chip_class) { - case GFX6: - program->family = CHIP_TAHITI; - break; - case GFX7: - program->family = CHIP_BONAIRE; - break; - case GFX8: - program->family = CHIP_POLARIS10; - break; - case GFX9: - program->family = CHIP_VEGA10; - break; - case GFX10: - program->family = CHIP_NAVI10; - break; - default: - program->family = CHIP_UNKNOWN; - break; + case GFX6: program->family = CHIP_TAHITI; break; + case GFX7: program->family = CHIP_BONAIRE; break; + case GFX8: program->family = CHIP_POLARIS10; break; + case GFX9: program->family = CHIP_VEGA10; break; + case GFX10: program->family = CHIP_NAVI10; break; + default: program->family = CHIP_UNKNOWN; break; } } else { program->family = family; @@ -98,7 +87,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->lane_mask = program->wave_size == 32 ? s1 : s2; program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256; - program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; + program->dev.lds_alloc_granule = + chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule; program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768; /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; @@ -111,7 +101,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */ program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512; program->dev.sgpr_alloc_granule = 128; - program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ + program->dev.sgpr_limit = + 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ if (chip_class >= GFX10_3) program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 
16 : 8; else @@ -145,18 +136,14 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, /* GFX9 APUS */ case CHIP_RAVEN: case CHIP_RAVEN2: - case CHIP_RENOIR: - program->dev.xnack_enabled = true; - break; - default: - break; + case CHIP_RENOIR: program->dev.xnack_enabled = true; break; + default: break; } program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS; /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */ program->dev.has_fast_fma32 = program->chip_class >= GFX9; - if (program->family == CHIP_TAHITI || - program->family == CHIP_CARRIZO || + if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO || program->family == CHIP_HAWAII) program->dev.has_fast_fma32 = true; @@ -176,29 +163,24 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->next_fp_mode.round32 = fp_round_ne; } -memory_sync_info get_sync_info(const Instruction* instr) +memory_sync_info +get_sync_info(const Instruction* instr) { switch (instr->format) { - case Format::SMEM: - return instr->smem().sync; - case Format::MUBUF: - return instr->mubuf().sync; - case Format::MIMG: - return instr->mimg().sync; - case Format::MTBUF: - return instr->mtbuf().sync; + case Format::SMEM: return instr->smem().sync; + case Format::MUBUF: return instr->mubuf().sync; + case Format::MIMG: return instr->mimg().sync; + case Format::MTBUF: return instr->mtbuf().sync; case Format::FLAT: case Format::GLOBAL: - case Format::SCRATCH: - return instr->flatlike().sync; - case Format::DS: - return instr->ds().sync; - default: - return memory_sync_info(); + case Format::SCRATCH: return instr->flatlike().sync; + case Format::DS: return instr->ds().sync; + default: return memory_sync_info(); } } -bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra) +bool +can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra) { if (!instr->isVALU()) return false; @@ -218,7 +200,7 @@ bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_r if (vop3.omod && chip < GFX9) return false; - //TODO: return true if we know we will use vcc + // TODO: return true if we know we will use vcc if (!pre_ra && instr->definitions.size() >= 2) return false; @@ -244,38 +226,36 @@ bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_r return false; } - bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || - instr->opcode == aco_opcode::v_mac_f16 || - instr->opcode == aco_opcode::v_fmac_f32 || - instr->opcode == aco_opcode::v_fmac_f16; + bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 || + instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16; if (chip != GFX8 && is_mac) return false; - //TODO: return true if we know we will use vcc + // TODO: return true if we know we will use vcc if (!pre_ra && instr->isVOPC()) return false; if (!pre_ra && instr->operands.size() >= 3 && !is_mac) return false; - return instr->opcode != aco_opcode::v_madmk_f32 && - instr->opcode != aco_opcode::v_madak_f32 && - instr->opcode != aco_opcode::v_madmk_f16 && - instr->opcode != aco_opcode::v_madak_f16 && + return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && instr->opcode != aco_opcode::v_readfirstlane_b32 && - instr->opcode != aco_opcode::v_clrexcp && - instr->opcode != aco_opcode::v_swap_b32; + instr->opcode != aco_opcode::v_clrexcp && 
instr->opcode != aco_opcode::v_swap_b32; } /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ -aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& instr) +aco_ptr +convert_to_SDWA(chip_class chip, aco_ptr& instr) { if (instr->isSDWA()) return NULL; aco_ptr tmp = std::move(instr); - Format format = (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + Format format = + (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin()); @@ -295,15 +275,9 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst break; switch (instr->operands[i].bytes()) { - case 1: - sdwa.sel[i] = sdwa_ubyte; - break; - case 2: - sdwa.sel[i] = sdwa_uword; - break; - case 4: - sdwa.sel[i] = sdwa_udword; - break; + case 1: sdwa.sel[i] = sdwa_ubyte; break; + case 2: sdwa.sel[i] = sdwa_uword; break; + case 4: sdwa.sel[i] = sdwa_udword; break; } } switch (instr->definitions[0].bytes()) { @@ -315,9 +289,7 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst sdwa.dst_sel = sdwa_uword; sdwa.dst_preserve = true; break; - case 4: - sdwa.dst_sel = sdwa_udword; - break; + case 4: sdwa.dst_sel = sdwa_udword; break; } if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8) @@ -330,7 +302,8 @@ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& inst return tmp; } -bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) +bool +can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) { /* opsel is only GFX9+ */ if ((high || idx == -1) && chip < GFX9) @@ -362,21 +335,18 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) case aco_opcode::v_lshlrev_b16_e64: case aco_opcode::v_lshrrev_b16_e64: case aco_opcode::v_ashrrev_i16_e64: - case aco_opcode::v_mul_lo_u16_e64: - return true; + case aco_opcode::v_mul_lo_u16_e64: return true; case aco_opcode::v_pack_b32_f16: case aco_opcode::v_cvt_pknorm_i16_f16: - case aco_opcode::v_cvt_pknorm_u16_f16: - return idx != -1; + case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1; case aco_opcode::v_mad_u32_u16: - case aco_opcode::v_mad_i32_i16: - return idx >= 0 && idx < 2; - default: - return false; + case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2; + default: return false; } } -uint32_t get_reduction_identity(ReduceOp op, unsigned idx) +uint32_t +get_reduction_identity(ReduceOp op, unsigned idx) { switch (op) { case iadd8: @@ -397,65 +367,44 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx) case umax8: case umax16: case umax32: - case umax64: - return 0; + case umax64: return 0; case imul8: case imul16: case imul32: - case imul64: - return idx ? 0 : 1; - case fmul16: - return 0x3c00u; /* 1.0 */ - case fmul32: - return 0x3f800000u; /* 1.0 */ - case fmul64: - return idx ? 0x3ff00000u : 0u; /* 1.0 */ - case imin8: - return INT8_MAX; - case imin16: - return INT16_MAX; - case imin32: - return INT32_MAX; - case imin64: - return idx ? 0x7fffffffu : 0xffffffffu; - case imax8: - return INT8_MIN; - case imax16: - return INT16_MIN; - case imax32: - return INT32_MIN; - case imax64: - return idx ? 0x80000000u : 0; + case imul64: return idx ? 
0 : 1; + case fmul16: return 0x3c00u; /* 1.0 */ + case fmul32: return 0x3f800000u; /* 1.0 */ + case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */ + case imin8: return INT8_MAX; + case imin16: return INT16_MAX; + case imin32: return INT32_MAX; + case imin64: return idx ? 0x7fffffffu : 0xffffffffu; + case imax8: return INT8_MIN; + case imax16: return INT16_MIN; + case imax32: return INT32_MIN; + case imax64: return idx ? 0x80000000u : 0; case umin8: case umin16: case iand8: - case iand16: - return 0xffffffffu; + case iand16: return 0xffffffffu; case umin32: case umin64: case iand32: - case iand64: - return 0xffffffffu; - case fmin16: - return 0x7c00u; /* infinity */ - case fmin32: - return 0x7f800000u; /* infinity */ - case fmin64: - return idx ? 0x7ff00000u : 0u; /* infinity */ - case fmax16: - return 0xfc00u; /* negative infinity */ - case fmax32: - return 0xff800000u; /* negative infinity */ - case fmax64: - return idx ? 0xfff00000u : 0u; /* negative infinity */ - default: - unreachable("Invalid reduction operation"); - break; + case iand64: return 0xffffffffu; + case fmin16: return 0x7c00u; /* infinity */ + case fmin32: return 0x7f800000u; /* infinity */ + case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */ + case fmax16: return 0xfc00u; /* negative infinity */ + case fmax32: return 0xff800000u; /* negative infinity */ + case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */ + default: unreachable("Invalid reduction operation"); break; } return 0; } -bool needs_exec_mask(const Instruction* instr) { +bool +needs_exec_mask(const Instruction* instr) +{ if (instr->isSALU() || instr->isBranch()) return instr->reads_exec(); if (instr->isSMEM()) @@ -479,10 +428,8 @@ bool needs_exec_mask(const Instruction* instr) { case aco_opcode::p_reload: case aco_opcode::p_logical_start: case aco_opcode::p_logical_end: - case aco_opcode::p_startpgm: - return false; - default: - break; + case aco_opcode::p_startpgm: return false; + default: break; } } @@ -495,10 +442,11 @@ bool needs_exec_mask(const Instruction* instr) { return true; } -wait_imm::wait_imm() : - vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {} -wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : - vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} +wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) +{} +wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) + : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) +{} wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) { @@ -513,7 +461,8 @@ wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) lgkm |= (packed >> 8) & 0x30; } -uint16_t wait_imm::pack(enum chip_class chip) const +uint16_t +wait_imm::pack(enum chip_class chip) const { uint16_t imm = 0; assert(exp == unset_counter || exp <= 0x7); @@ -536,13 +485,16 @@ uint16_t wait_imm::pack(enum chip_class chip) const break; } if (chip < GFX9 && vm == wait_imm::unset_counter) - imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */ + imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the + architecture when interpreting the immediate */ if (chip < GFX10 && lgkm == wait_imm::unset_counter) - imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */ + imm |= 
0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the + architecture when interpreting the immediate */ return imm; } -bool wait_imm::combine(const wait_imm& other) +bool +wait_imm::combine(const wait_imm& other) { bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; vm = std::min(vm, other.vm); @@ -552,17 +504,21 @@ bool wait_imm::combine(const wait_imm& other) return changed; } -bool wait_imm::empty() const +bool +wait_imm::empty() const { - return vm == unset_counter && exp == unset_counter && - lgkm == unset_counter && vs == unset_counter; + return vm == unset_counter && exp == unset_counter && lgkm == unset_counter && + vs == unset_counter; } -bool should_form_clause(const Instruction *a, const Instruction *b) +bool +should_form_clause(const Instruction* a, const Instruction* b) { /* Vertex attribute loads from the same binding likely load from similar addresses */ - unsigned a_vtx_binding = a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); - unsigned b_vtx_binding = b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); + unsigned a_vtx_binding = + a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0); + unsigned b_vtx_binding = + b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0); if (a_vtx_binding && a_vtx_binding == b_vtx_binding) return true; @@ -584,4 +540,4 @@ bool should_form_clause(const Instruction *a, const Instruction *b) return false; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 9bbbbe2cadd..2675150d126 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -27,6 +27,7 @@ #include "aco_opcodes.h" #include "aco_util.h" + #include "vulkan/radv_shader.h" #include "nir.h" @@ -129,11 +130,11 @@ enum class instr_class : uint8_t { }; enum storage_class : uint8_t { - storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ + storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ storage_buffer = 0x1, /* SSBOs and global memory */ storage_atomic_counter = 0x2, /* not used for Vulkan */ storage_image = 0x4, - storage_shared = 0x8, /* or TCS output */ + storage_shared = 0x8, /* or TCS output */ storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ storage_scratch = 0x20, storage_vgpr_spill = 0x40, @@ -157,7 +158,8 @@ enum memory_semantics : uint8_t { /* does not interact with barriers and assumes this lane is the only lane * accessing this memory */ semantic_private = 0x8, - /* this operation can be reordered around operations of the same storage. says nothing about barriers */ + /* this operation can be reordered around operations of the same storage. 
+ * says nothing about barriers */ semantic_can_reorder = 0x10, /* this is a atomic instruction (may only read or write memory) */ semantic_atomic = 0x20, @@ -178,20 +180,21 @@ enum sync_scope : uint8_t { struct memory_sync_info { memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {} - memory_sync_info(int storage_, int semantics_=0, sync_scope scope_=scope_invocation) - : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_) {} + memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation) + : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_) + {} - storage_class storage:8; - memory_semantics semantics:8; - sync_scope scope:8; + storage_class storage : 8; + memory_semantics semantics : 8; + sync_scope scope : 8; - bool operator == (const memory_sync_info& rhs) const { - return storage == rhs.storage && - semantics == rhs.semantics && - scope == rhs.scope; + bool operator==(const memory_sync_info& rhs) const + { + return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope; } - bool can_reorder() const { + bool can_reorder() const + { if (semantics & semantic_acqrel) return false; /* Also check storage so that zero-initialized memory_sync_info can be @@ -221,33 +224,34 @@ struct float_mode { /* matches encoding of the MODE register */ union { struct { - fp_round round32:2; - fp_round round16_64:2; - unsigned denorm32:2; - unsigned denorm16_64:2; + fp_round round32 : 2; + fp_round round16_64 : 2; + unsigned denorm32 : 2; + unsigned denorm16_64 : 2; }; struct { - uint8_t round:4; - uint8_t denorm:4; + uint8_t round : 4; + uint8_t denorm : 4; }; uint8_t val = 0; }; /* if false, optimizations which may remove infs/nan/-0.0 can be done */ - bool preserve_signed_zero_inf_nan32:1; - bool preserve_signed_zero_inf_nan16_64:1; + bool preserve_signed_zero_inf_nan32 : 1; + bool preserve_signed_zero_inf_nan16_64 : 1; /* if false, optimizations which may remove denormal flushing can be done */ - bool must_flush_denorms32:1; - bool must_flush_denorms16_64:1; - bool care_about_round32:1; - bool care_about_round16_64:1; + bool must_flush_denorms32 : 1; + bool must_flush_denorms16_64 : 1; + bool care_about_round32 : 1; + bool care_about_round16_64 : 1; /* Returns true if instructions using the mode "other" can safely use the * current one instead. 
*/ - bool canReplace(float_mode other) const noexcept { + bool canReplace(float_mode other) const noexcept + { return val == other.val && (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) && (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) && - (must_flush_denorms32 || !other.must_flush_denorms32) && + (must_flush_denorms32 || !other.must_flush_denorms32) && (must_flush_denorms16_64 || !other.must_flush_denorms16_64) && (care_about_round32 || !other.care_about_round32) && (care_about_round16_64 || !other.care_about_round16_64); @@ -273,13 +277,17 @@ struct wait_imm { bool empty() const; }; -constexpr Format asVOP3(Format format) { - return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); +constexpr Format +asVOP3(Format format) +{ + return (Format)((uint32_t)Format::VOP3 | (uint32_t)format); }; -constexpr Format asSDWA(Format format) { +constexpr Format +asSDWA(Format format) +{ assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC); - return (Format) ((uint32_t) Format::SDWA | (uint32_t) format); + return (Format)((uint32_t)Format::SDWA | (uint32_t)format); } enum class RegType { @@ -303,10 +311,10 @@ struct RegClass { v2 = s2 | (1 << 5), v3 = s3 | (1 << 5), v4 = s4 | (1 << 5), - v5 = 5 | (1 << 5), - v6 = 6 | (1 << 5), - v7 = 7 | (1 << 5), - v8 = 8 | (1 << 5), + v5 = 5 | (1 << 5), + v6 = 6 | (1 << 5), + v7 = 7 | (1 << 5), + v8 = 8 | (1 << 5), /* byte-sized register class */ v1b = v1 | (1 << 7), v2b = v2 | (1 << 7), @@ -320,29 +328,29 @@ struct RegClass { }; RegClass() = default; - constexpr RegClass(RC rc_) - : rc(rc_) {} + constexpr RegClass(RC rc_) : rc(rc_) {} constexpr RegClass(RegType type, unsigned size) - : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {} + : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size)) + {} constexpr operator RC() const { return rc; } explicit operator bool() = delete; constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } constexpr bool is_subdword() const { return rc & (1 << 7); } - constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); } - //TODO: use size() less in favor of bytes() + constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); } + // TODO: use size() less in favor of bytes() constexpr unsigned size() const { return (bytes() + 3) >> 2; } constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } - constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } - constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); } + constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); } + constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); } - static constexpr RegClass get(RegType type, unsigned bytes) { + static constexpr RegClass get(RegType type, unsigned bytes) + { if (type == RegType::sgpr) { return RegClass(type, DIV_ROUND_UP(bytes, 4u)); } else { - return bytes % 4u ? RegClass(type, bytes).as_subdword() : - RegClass(type, bytes / 4u); + return bytes % 4u ? 
RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u); } } @@ -380,8 +388,7 @@ static constexpr RegClass v8b{RegClass::v8b}; */ struct Temp { Temp() noexcept : id_(0), reg_class(0) {} - constexpr Temp(uint32_t id, RegClass cls) noexcept - : id_(id), reg_class(uint8_t(cls)) {} + constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {} constexpr uint32_t id() const noexcept { return id_; } constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; } @@ -391,12 +398,12 @@ struct Temp { constexpr RegType type() const noexcept { return regClass().type(); } constexpr bool is_linear() const noexcept { return regClass().is_linear(); } - constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } + constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); } constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } private: - uint32_t id_: 24; + uint32_t id_ : 24; uint32_t reg_class : 8; }; @@ -413,8 +420,13 @@ struct PhysReg { constexpr operator unsigned() const { return reg(); } constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; } constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; } - constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; } - constexpr PhysReg advance(int bytes) const { PhysReg res = *this; res.reg_b += bytes; return res; } + constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; } + constexpr PhysReg advance(int bytes) const + { + PhysReg res = *this; + res.reg_b += bytes; + return res; + } uint16_t reg_b = 0; }; @@ -453,13 +465,13 @@ static constexpr PhysReg scc{253}; * Temporary registers get mapped to physical register during RA * Constant values are inlined into the instruction sequence. */ -class Operand final -{ +class Operand final { public: constexpr Operand() - : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), - isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0), - isLateKill_(false), is16bit_(false), is24bit_(false), signext(false) {} + : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false), + isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false), + is24bit_(false), signext(false) + {} explicit Operand(Temp r) noexcept { @@ -553,11 +565,11 @@ public: isConstant_ = true; constSize = 3; if (v <= 64) { - data_.i = (uint32_t) v; - setFixed(PhysReg{128 + (uint32_t) v}); + data_.i = (uint32_t)v; + setFixed(PhysReg{128 + (uint32_t)v}); } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. 
-1] */ - data_.i = (uint32_t) v; - setFixed(PhysReg{192 - (uint32_t) v}); + data_.i = (uint32_t)v; + setFixed(PhysReg{192 - (uint32_t)v}); } else if (v == 0x3FE0000000000000) { /* 0.5 */ data_.i = 0x3f000000; setFixed(PhysReg{240}); @@ -586,7 +598,8 @@ public: signext = v >> 63; data_.i = v & 0xffffffffu; setFixed(PhysReg{255}); - assert(constantValue64() == v && "attempt to create a unrepresentable 64-bit literal constant"); + assert(constantValue64() == v && + "attempt to create a unrepresentable 64-bit literal constant"); } }; explicit Operand(RegClass type) noexcept @@ -623,7 +636,8 @@ public: return Operand((uint8_t)val); } - static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext=false, bool sext=false) + static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false, + bool sext = false) { if (bytes <= 4) return true; @@ -634,48 +648,33 @@ public: if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0)) return true; - return val <= 64 || - val >= 0xFFFFFFFFFFFFFFF0 || /* [-16 .. -1] */ - val == 0x3FE0000000000000 || /* 0.5 */ - val == 0xBFE0000000000000 || /* -0.5 */ - val == 0x3FF0000000000000 || /* 1.0 */ - val == 0xBFF0000000000000 || /* -1.0 */ - val == 0x4000000000000000 || /* 2.0 */ - val == 0xC000000000000000 || /* -2.0 */ - val == 0x4010000000000000 || /* 4.0 */ - val == 0xC010000000000000; /* -4.0 */ + return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */ + val == 0x3FE0000000000000 || /* 0.5 */ + val == 0xBFE0000000000000 || /* -0.5 */ + val == 0x3FF0000000000000 || /* 1.0 */ + val == 0xBFF0000000000000 || /* -1.0 */ + val == 0x4000000000000000 || /* 2.0 */ + val == 0xC000000000000000 || /* -2.0 */ + val == 0x4010000000000000 || /* 4.0 */ + val == 0xC010000000000000; /* -4.0 */ } - constexpr bool isTemp() const noexcept + constexpr bool isTemp() const noexcept { return isTemp_; } + + constexpr void setTemp(Temp t) noexcept { - return isTemp_; - } - - constexpr void setTemp(Temp t) noexcept { assert(!isConstant_); isTemp_ = true; data_.temp = t; } - constexpr Temp getTemp() const noexcept - { - return data_.temp; - } + constexpr Temp getTemp() const noexcept { return data_.temp; } - constexpr uint32_t tempId() const noexcept - { - return data_.temp.id(); - } + constexpr uint32_t tempId() const noexcept { return data_.temp.id(); } - constexpr bool hasRegClass() const noexcept - { - return isTemp() || isUndefined(); - } + constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); } - constexpr RegClass regClass() const noexcept - { - return data_.temp.regClass(); - } + constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); } constexpr unsigned bytes() const noexcept { @@ -693,15 +692,9 @@ public: return data_.temp.size(); } - constexpr bool isFixed() const noexcept - { - return isFixed_; - } + constexpr bool isFixed() const noexcept { return isFixed_; } - constexpr PhysReg physReg() const noexcept - { - return reg_; - } + constexpr PhysReg physReg() const noexcept { return reg_; } constexpr void setFixed(PhysReg reg) noexcept { @@ -709,25 +702,13 @@ public: reg_ = reg; } - constexpr bool isConstant() const noexcept - { - return isConstant_; - } + constexpr bool isConstant() const noexcept { return isConstant_; } - constexpr bool isLiteral() const noexcept - { - return isConstant() && reg_ == 255; - } + constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; } - constexpr bool isUndefined() const noexcept - { - return isUndef_; - } + constexpr bool 
isUndefined() const noexcept { return isUndef_; } - constexpr uint32_t constantValue() const noexcept - { - return data_.i; - } + constexpr uint32_t constantValue() const noexcept { return data_.i; } constexpr bool constantEquals(uint32_t cmp) const noexcept { @@ -743,22 +724,14 @@ public: return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); switch (reg_) { - case 240: - return 0x3FE0000000000000; - case 241: - return 0xBFE0000000000000; - case 242: - return 0x3FF0000000000000; - case 243: - return 0xBFF0000000000000; - case 244: - return 0x4000000000000000; - case 245: - return 0xC000000000000000; - case 246: - return 0x4010000000000000; - case 247: - return 0xC010000000000000; + case 240: return 0x3FE0000000000000; + case 241: return 0xBFE0000000000000; + case 242: return 0x3FF0000000000000; + case 243: return 0xBFF0000000000000; + case 244: return 0x4000000000000000; + case 245: return 0xC000000000000000; + case 246: return 0x4010000000000000; + case 247: return 0xC010000000000000; case 255: return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; } @@ -776,15 +749,9 @@ public: /* Indicates that the killed operand's live range intersects with the * instruction's definitions. Unlike isKill() and isFirstKill(), this is * not set by liveness analysis. */ - constexpr void setLateKill(bool flag) noexcept - { - isLateKill_ = flag; - } + constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; } - constexpr bool isLateKill() const noexcept - { - return isLateKill_; - } + constexpr bool isLateKill() const noexcept { return isLateKill_; } constexpr void setKill(bool flag) noexcept { @@ -793,10 +760,7 @@ public: setFirstKill(false); } - constexpr bool isKill() const noexcept - { - return isKill_ || isFirstKill(); - } + constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); } constexpr void setFirstKill(bool flag) noexcept { @@ -807,22 +771,13 @@ public: /* When there are multiple operands killing the same temporary, * isFirstKill() is only returns true for the first one. 
*/ - constexpr bool isFirstKill() const noexcept - { - return isFirstKill_; - } + constexpr bool isFirstKill() const noexcept { return isFirstKill_; } - constexpr bool isKillBeforeDef() const noexcept - { - return isKill() && !isLateKill(); - } + constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); } - constexpr bool isFirstKillBeforeDef() const noexcept - { - return isFirstKill() && !isLateKill(); - } + constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); } - constexpr bool operator == (Operand other) const noexcept + constexpr bool operator==(Operand other) const noexcept { if (other.size() != size()) return false; @@ -840,51 +795,36 @@ public: return other.isTemp() && other.getTemp() == getTemp(); } - constexpr bool operator != (Operand other) const noexcept - { - return !operator==(other); - } + constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); } - constexpr void set16bit(bool flag) noexcept - { - is16bit_ = flag; - } + constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; } - constexpr bool is16bit() const noexcept - { - return is16bit_; - } + constexpr bool is16bit() const noexcept { return is16bit_; } - constexpr void set24bit(bool flag) noexcept - { - is24bit_ = flag; - } + constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; } - constexpr bool is24bit() const noexcept - { - return is24bit_; - } + constexpr bool is24bit() const noexcept { return is24bit_; } private: union { Temp temp; uint32_t i; float f; - } data_ = { Temp(0, s1) }; + } data_ = {Temp(0, s1)}; PhysReg reg_; union { struct { - uint8_t isTemp_:1; - uint8_t isFixed_:1; - uint8_t isConstant_:1; - uint8_t isKill_:1; - uint8_t isUndef_:1; - uint8_t isFirstKill_:1; - uint8_t constSize:2; - uint8_t isLateKill_:1; - uint8_t is16bit_:1; - uint8_t is24bit_:1; - uint8_t signext:1; + uint8_t isTemp_ : 1; + uint8_t isFixed_ : 1; + uint8_t isConstant_ : 1; + uint8_t isKill_ : 1; + uint8_t isUndef_ : 1; + uint8_t isFirstKill_ : 1; + uint8_t constSize : 2; + uint8_t isLateKill_ : 1; + uint8_t is16bit_ : 1; + uint8_t is24bit_ : 1; + uint8_t signext : 1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint16_t control_ = 0; @@ -897,73 +837,39 @@ private: * and refer to temporary virtual registers * which are later mapped to physical registers */ -class Definition final -{ +class Definition final { public: - constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), - isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0) {} - Definition(uint32_t index, RegClass type) noexcept - : temp(index, type) {} - explicit Definition(Temp tmp) noexcept - : temp(tmp) {} - Definition(PhysReg reg, RegClass type) noexcept - : temp(Temp(0, type)) - { - setFixed(reg); - } - Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept - : temp(Temp(tmpId, type)) + constexpr Definition() + : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0), + isNoCSE_(0) + {} + Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} + explicit Definition(Temp tmp) noexcept : temp(tmp) {} + Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); } + Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type)) { setFixed(reg); } - constexpr bool isTemp() const noexcept - { - return tempId() > 0; - } + constexpr bool isTemp() const noexcept { return tempId() > 0; } - constexpr Temp 
getTemp() const noexcept - { - return temp; - } + constexpr Temp getTemp() const noexcept { return temp; } - constexpr uint32_t tempId() const noexcept - { - return temp.id(); - } + constexpr uint32_t tempId() const noexcept { return temp.id(); } - constexpr void setTemp(Temp t) noexcept { - temp = t; - } + constexpr void setTemp(Temp t) noexcept { temp = t; } - void swapTemp(Definition& other) noexcept { - std::swap(temp, other.temp); - } + void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); } - constexpr RegClass regClass() const noexcept - { - return temp.regClass(); - } + constexpr RegClass regClass() const noexcept { return temp.regClass(); } - constexpr unsigned bytes() const noexcept - { - return temp.bytes(); - } + constexpr unsigned bytes() const noexcept { return temp.bytes(); } - constexpr unsigned size() const noexcept - { - return temp.size(); - } + constexpr unsigned size() const noexcept { return temp.size(); } - constexpr bool isFixed() const noexcept - { - return isFixed_; - } + constexpr bool isFixed() const noexcept { return isFixed_; } - constexpr PhysReg physReg() const noexcept - { - return reg_; - } + constexpr PhysReg physReg() const noexcept { return reg_; } constexpr void setFixed(PhysReg reg) noexcept { @@ -977,63 +883,36 @@ public: reg_ = reg; } - constexpr bool hasHint() const noexcept - { - return hasHint_; - } + constexpr bool hasHint() const noexcept { return hasHint_; } - constexpr void setKill(bool flag) noexcept - { - isKill_ = flag; - } + constexpr void setKill(bool flag) noexcept { isKill_ = flag; } - constexpr bool isKill() const noexcept - { - return isKill_; - } + constexpr bool isKill() const noexcept { return isKill_; } - constexpr void setPrecise(bool precise) noexcept - { - isPrecise_ = precise; - } + constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; } - constexpr bool isPrecise() const noexcept - { - return isPrecise_; - } + constexpr bool isPrecise() const noexcept { return isPrecise_; } /* No Unsigned Wrap */ - constexpr void setNUW(bool nuw) noexcept - { - isNUW_ = nuw; - } + constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; } - constexpr bool isNUW() const noexcept - { - return isNUW_; - } + constexpr bool isNUW() const noexcept { return isNUW_; } - constexpr void setNoCSE(bool noCSE) noexcept - { - isNoCSE_ = noCSE; - } + constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; } - constexpr bool isNoCSE() const noexcept - { - return isNoCSE_; - } + constexpr bool isNoCSE() const noexcept { return isNoCSE_; } private: Temp temp = Temp(0, s1); PhysReg reg_; union { struct { - uint8_t isFixed_:1; - uint8_t hasHint_:1; - uint8_t isKill_:1; - uint8_t isPrecise_:1; - uint8_t isNUW_:1; - uint8_t isNoCSE_:1; + uint8_t isFixed_ : 1; + uint8_t hasHint_ : 1; + uint8_t isKill_ : 1; + uint8_t isPrecise_ : 1; + uint8_t isNUW_ : 1; + uint8_t isNoCSE_ : 1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; @@ -1086,99 +965,298 @@ struct Instruction { return false; } - Pseudo_instruction& pseudo() noexcept {assert(isPseudo()); return *(Pseudo_instruction *)this;} - const Pseudo_instruction& pseudo() const noexcept {assert(isPseudo()); return *(Pseudo_instruction *)this;} - constexpr bool isPseudo() const noexcept {return format == Format::PSEUDO;} - SOP1_instruction& sop1() noexcept {assert(isSOP1()); return *(SOP1_instruction *)this;} - const SOP1_instruction& sop1() const noexcept {assert(isSOP1()); return *(SOP1_instruction *)this;} - constexpr bool 
isSOP1() const noexcept {return format == Format::SOP1;} - SOP2_instruction& sop2() noexcept {assert(isSOP2()); return *(SOP2_instruction *)this;} - const SOP2_instruction& sop2() const noexcept {assert(isSOP2()); return *(SOP2_instruction *)this;} - constexpr bool isSOP2() const noexcept {return format == Format::SOP2;} - SOPK_instruction& sopk() noexcept {assert(isSOPK()); return *(SOPK_instruction *)this;} - const SOPK_instruction& sopk() const noexcept {assert(isSOPK()); return *(SOPK_instruction *)this;} - constexpr bool isSOPK() const noexcept {return format == Format::SOPK;} - SOPP_instruction& sopp() noexcept {assert(isSOPP()); return *(SOPP_instruction *)this;} - const SOPP_instruction& sopp() const noexcept {assert(isSOPP()); return *(SOPP_instruction *)this;} - constexpr bool isSOPP() const noexcept {return format == Format::SOPP;} - SOPC_instruction& sopc() noexcept {assert(isSOPC()); return *(SOPC_instruction *)this;} - const SOPC_instruction& sopc() const noexcept {assert(isSOPC()); return *(SOPC_instruction *)this;} - constexpr bool isSOPC() const noexcept {return format == Format::SOPC;} - SMEM_instruction& smem() noexcept {assert(isSMEM()); return *(SMEM_instruction *)this;} - const SMEM_instruction& smem() const noexcept {assert(isSMEM()); return *(SMEM_instruction *)this;} - constexpr bool isSMEM() const noexcept {return format == Format::SMEM;} - DS_instruction& ds() noexcept {assert(isDS()); return *(DS_instruction *)this;} - const DS_instruction& ds() const noexcept {assert(isDS()); return *(DS_instruction *)this;} - constexpr bool isDS() const noexcept {return format == Format::DS;} - MTBUF_instruction& mtbuf() noexcept {assert(isMTBUF()); return *(MTBUF_instruction *)this;} - const MTBUF_instruction& mtbuf() const noexcept {assert(isMTBUF()); return *(MTBUF_instruction *)this;} - constexpr bool isMTBUF() const noexcept {return format == Format::MTBUF;} - MUBUF_instruction& mubuf() noexcept {assert(isMUBUF()); return *(MUBUF_instruction *)this;} - const MUBUF_instruction& mubuf() const noexcept {assert(isMUBUF()); return *(MUBUF_instruction *)this;} - constexpr bool isMUBUF() const noexcept {return format == Format::MUBUF;} - MIMG_instruction& mimg() noexcept {assert(isMIMG()); return *(MIMG_instruction *)this;} - const MIMG_instruction& mimg() const noexcept {assert(isMIMG()); return *(MIMG_instruction *)this;} - constexpr bool isMIMG() const noexcept {return format == Format::MIMG;} - Export_instruction& exp() noexcept {assert(isEXP()); return *(Export_instruction *)this;} - const Export_instruction& exp() const noexcept {assert(isEXP()); return *(Export_instruction *)this;} - constexpr bool isEXP() const noexcept {return format == Format::EXP;} - FLAT_instruction& flat() noexcept {assert(isFlat()); return *(FLAT_instruction *)this;} - const FLAT_instruction& flat() const noexcept {assert(isFlat()); return *(FLAT_instruction *)this;} - constexpr bool isFlat() const noexcept {return format == Format::FLAT;} - FLAT_instruction& global() noexcept {assert(isGlobal()); return *(FLAT_instruction *)this;} - const FLAT_instruction& global() const noexcept {assert(isGlobal()); return *(FLAT_instruction *)this;} - constexpr bool isGlobal() const noexcept {return format == Format::GLOBAL;} - FLAT_instruction& scratch() noexcept {assert(isScratch()); return *(FLAT_instruction *)this;} - const FLAT_instruction& scratch() const noexcept {assert(isScratch()); return *(FLAT_instruction *)this;} - constexpr bool isScratch() const noexcept {return format == Format::SCRATCH;} - 
Pseudo_branch_instruction& branch() noexcept {assert(isBranch()); return *(Pseudo_branch_instruction *)this;} - const Pseudo_branch_instruction& branch() const noexcept {assert(isBranch()); return *(Pseudo_branch_instruction *)this;} - constexpr bool isBranch() const noexcept {return format == Format::PSEUDO_BRANCH;} - Pseudo_barrier_instruction& barrier() noexcept {assert(isBarrier()); return *(Pseudo_barrier_instruction *)this;} - const Pseudo_barrier_instruction& barrier() const noexcept {assert(isBarrier()); return *(Pseudo_barrier_instruction *)this;} - constexpr bool isBarrier() const noexcept {return format == Format::PSEUDO_BARRIER;} - Pseudo_reduction_instruction& reduction() noexcept {assert(isReduction()); return *(Pseudo_reduction_instruction *)this;} - const Pseudo_reduction_instruction& reduction() const noexcept {assert(isReduction()); return *(Pseudo_reduction_instruction *)this;} - constexpr bool isReduction() const noexcept {return format == Format::PSEUDO_REDUCTION;} - VOP3P_instruction& vop3p() noexcept {assert(isVOP3P()); return *(VOP3P_instruction *)this;} - const VOP3P_instruction& vop3p() const noexcept {assert(isVOP3P()); return *(VOP3P_instruction *)this;} - constexpr bool isVOP3P() const noexcept {return format == Format::VOP3P;} - VOP1_instruction& vop1() noexcept {assert(isVOP1()); return *(VOP1_instruction *)this;} - const VOP1_instruction& vop1() const noexcept {assert(isVOP1()); return *(VOP1_instruction *)this;} - constexpr bool isVOP1() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP1;} - VOP2_instruction& vop2() noexcept {assert(isVOP2()); return *(VOP2_instruction *)this;} - const VOP2_instruction& vop2() const noexcept {assert(isVOP2()); return *(VOP2_instruction *)this;} - constexpr bool isVOP2() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP2;} - VOPC_instruction& vopc() noexcept {assert(isVOPC()); return *(VOPC_instruction *)this;} - const VOPC_instruction& vopc() const noexcept {assert(isVOPC()); return *(VOPC_instruction *)this;} - constexpr bool isVOPC() const noexcept {return (uint16_t)format & (uint16_t)Format::VOPC;} - VOP3_instruction& vop3() noexcept {assert(isVOP3()); return *(VOP3_instruction *)this;} - const VOP3_instruction& vop3() const noexcept {assert(isVOP3()); return *(VOP3_instruction *)this;} - constexpr bool isVOP3() const noexcept {return (uint16_t)format & (uint16_t)Format::VOP3;} - Interp_instruction& vintrp() noexcept {assert(isVINTRP()); return *(Interp_instruction *)this;} - const Interp_instruction& vintrp() const noexcept {assert(isVINTRP()); return *(Interp_instruction *)this;} - constexpr bool isVINTRP() const noexcept {return (uint16_t)format & (uint16_t)Format::VINTRP;} - DPP_instruction& dpp() noexcept {assert(isDPP()); return *(DPP_instruction *)this;} - const DPP_instruction& dpp() const noexcept {assert(isDPP()); return *(DPP_instruction *)this;} - constexpr bool isDPP() const noexcept {return (uint16_t)format & (uint16_t)Format::DPP;} - SDWA_instruction& sdwa() noexcept {assert(isSDWA()); return *(SDWA_instruction *)this;} - const SDWA_instruction& sdwa() const noexcept {assert(isSDWA()); return *(SDWA_instruction *)this;} - constexpr bool isSDWA() const noexcept {return (uint16_t)format & (uint16_t)Format::SDWA;} - - FLAT_instruction& flatlike() + Pseudo_instruction& pseudo() noexcept { - return *(FLAT_instruction *)this; + assert(isPseudo()); + return *(Pseudo_instruction*)this; } - - const FLAT_instruction& flatlike() const + const Pseudo_instruction& pseudo() const noexcept { - 
return *(FLAT_instruction *)this; + assert(isPseudo()); + return *(Pseudo_instruction*)this; } - - constexpr bool isFlatLike() const noexcept + constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; } + SOP1_instruction& sop1() noexcept { - return isFlat() || isGlobal() || isScratch(); + assert(isSOP1()); + return *(SOP1_instruction*)this; } + const SOP1_instruction& sop1() const noexcept + { + assert(isSOP1()); + return *(SOP1_instruction*)this; + } + constexpr bool isSOP1() const noexcept { return format == Format::SOP1; } + SOP2_instruction& sop2() noexcept + { + assert(isSOP2()); + return *(SOP2_instruction*)this; + } + const SOP2_instruction& sop2() const noexcept + { + assert(isSOP2()); + return *(SOP2_instruction*)this; + } + constexpr bool isSOP2() const noexcept { return format == Format::SOP2; } + SOPK_instruction& sopk() noexcept + { + assert(isSOPK()); + return *(SOPK_instruction*)this; + } + const SOPK_instruction& sopk() const noexcept + { + assert(isSOPK()); + return *(SOPK_instruction*)this; + } + constexpr bool isSOPK() const noexcept { return format == Format::SOPK; } + SOPP_instruction& sopp() noexcept + { + assert(isSOPP()); + return *(SOPP_instruction*)this; + } + const SOPP_instruction& sopp() const noexcept + { + assert(isSOPP()); + return *(SOPP_instruction*)this; + } + constexpr bool isSOPP() const noexcept { return format == Format::SOPP; } + SOPC_instruction& sopc() noexcept + { + assert(isSOPC()); + return *(SOPC_instruction*)this; + } + const SOPC_instruction& sopc() const noexcept + { + assert(isSOPC()); + return *(SOPC_instruction*)this; + } + constexpr bool isSOPC() const noexcept { return format == Format::SOPC; } + SMEM_instruction& smem() noexcept + { + assert(isSMEM()); + return *(SMEM_instruction*)this; + } + const SMEM_instruction& smem() const noexcept + { + assert(isSMEM()); + return *(SMEM_instruction*)this; + } + constexpr bool isSMEM() const noexcept { return format == Format::SMEM; } + DS_instruction& ds() noexcept + { + assert(isDS()); + return *(DS_instruction*)this; + } + const DS_instruction& ds() const noexcept + { + assert(isDS()); + return *(DS_instruction*)this; + } + constexpr bool isDS() const noexcept { return format == Format::DS; } + MTBUF_instruction& mtbuf() noexcept + { + assert(isMTBUF()); + return *(MTBUF_instruction*)this; + } + const MTBUF_instruction& mtbuf() const noexcept + { + assert(isMTBUF()); + return *(MTBUF_instruction*)this; + } + constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; } + MUBUF_instruction& mubuf() noexcept + { + assert(isMUBUF()); + return *(MUBUF_instruction*)this; + } + const MUBUF_instruction& mubuf() const noexcept + { + assert(isMUBUF()); + return *(MUBUF_instruction*)this; + } + constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; } + MIMG_instruction& mimg() noexcept + { + assert(isMIMG()); + return *(MIMG_instruction*)this; + } + const MIMG_instruction& mimg() const noexcept + { + assert(isMIMG()); + return *(MIMG_instruction*)this; + } + constexpr bool isMIMG() const noexcept { return format == Format::MIMG; } + Export_instruction& exp() noexcept + { + assert(isEXP()); + return *(Export_instruction*)this; + } + const Export_instruction& exp() const noexcept + { + assert(isEXP()); + return *(Export_instruction*)this; + } + constexpr bool isEXP() const noexcept { return format == Format::EXP; } + FLAT_instruction& flat() noexcept + { + assert(isFlat()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& flat() const 
noexcept + { + assert(isFlat()); + return *(FLAT_instruction*)this; + } + constexpr bool isFlat() const noexcept { return format == Format::FLAT; } + FLAT_instruction& global() noexcept + { + assert(isGlobal()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& global() const noexcept + { + assert(isGlobal()); + return *(FLAT_instruction*)this; + } + constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; } + FLAT_instruction& scratch() noexcept + { + assert(isScratch()); + return *(FLAT_instruction*)this; + } + const FLAT_instruction& scratch() const noexcept + { + assert(isScratch()); + return *(FLAT_instruction*)this; + } + constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; } + Pseudo_branch_instruction& branch() noexcept + { + assert(isBranch()); + return *(Pseudo_branch_instruction*)this; + } + const Pseudo_branch_instruction& branch() const noexcept + { + assert(isBranch()); + return *(Pseudo_branch_instruction*)this; + } + constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; } + Pseudo_barrier_instruction& barrier() noexcept + { + assert(isBarrier()); + return *(Pseudo_barrier_instruction*)this; + } + const Pseudo_barrier_instruction& barrier() const noexcept + { + assert(isBarrier()); + return *(Pseudo_barrier_instruction*)this; + } + constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; } + Pseudo_reduction_instruction& reduction() noexcept + { + assert(isReduction()); + return *(Pseudo_reduction_instruction*)this; + } + const Pseudo_reduction_instruction& reduction() const noexcept + { + assert(isReduction()); + return *(Pseudo_reduction_instruction*)this; + } + constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; } + VOP3P_instruction& vop3p() noexcept + { + assert(isVOP3P()); + return *(VOP3P_instruction*)this; + } + const VOP3P_instruction& vop3p() const noexcept + { + assert(isVOP3P()); + return *(VOP3P_instruction*)this; + } + constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; } + VOP1_instruction& vop1() noexcept + { + assert(isVOP1()); + return *(VOP1_instruction*)this; + } + const VOP1_instruction& vop1() const noexcept + { + assert(isVOP1()); + return *(VOP1_instruction*)this; + } + constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; } + VOP2_instruction& vop2() noexcept + { + assert(isVOP2()); + return *(VOP2_instruction*)this; + } + const VOP2_instruction& vop2() const noexcept + { + assert(isVOP2()); + return *(VOP2_instruction*)this; + } + constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; } + VOPC_instruction& vopc() noexcept + { + assert(isVOPC()); + return *(VOPC_instruction*)this; + } + const VOPC_instruction& vopc() const noexcept + { + assert(isVOPC()); + return *(VOPC_instruction*)this; + } + constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; } + VOP3_instruction& vop3() noexcept + { + assert(isVOP3()); + return *(VOP3_instruction*)this; + } + const VOP3_instruction& vop3() const noexcept + { + assert(isVOP3()); + return *(VOP3_instruction*)this; + } + constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; } + Interp_instruction& vintrp() noexcept + { + assert(isVINTRP()); + return *(Interp_instruction*)this; + } + const Interp_instruction& vintrp() const noexcept + { + assert(isVINTRP()); + return *(Interp_instruction*)this; + } 
+ constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; } + DPP_instruction& dpp() noexcept + { + assert(isDPP()); + return *(DPP_instruction*)this; + } + const DPP_instruction& dpp() const noexcept + { + assert(isDPP()); + return *(DPP_instruction*)this; + } + constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; } + SDWA_instruction& sdwa() noexcept + { + assert(isSDWA()); + return *(SDWA_instruction*)this; + } + const SDWA_instruction& sdwa() const noexcept + { + assert(isSDWA()); + return *(SDWA_instruction*)this; + } + constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; } + + FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; } + + const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; } + + constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); } constexpr bool isVALU() const noexcept { @@ -1190,10 +1268,7 @@ struct Instruction { return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP(); } - constexpr bool isVMEM() const noexcept - { - return isMTBUF() || isMUBUF() || isMIMG(); - } + constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); } }; static_assert(sizeof(Instruction) == 16, "Unexpected padding"); @@ -1209,16 +1284,13 @@ struct SOPP_instruction : public Instruction { }; static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); -struct SOPC_instruction : public Instruction { -}; +struct SOPC_instruction : public Instruction {}; static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct SOP1_instruction : public Instruction { -}; +struct SOP1_instruction : public Instruction {}; static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct SOP2_instruction : public Instruction { -}; +struct SOP2_instruction : public Instruction {}; static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); /** @@ -1236,23 +1308,20 @@ struct SMEM_instruction : public Instruction { memory_sync_info sync; bool glc : 1; /* VI+: globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ - bool nv : 1; /* VEGA only: Non-volatile */ + bool nv : 1; /* VEGA only: Non-volatile */ bool disable_wqm : 1; bool prevent_overflow : 1; /* avoid overflow when combining additions */ - uint8_t padding: 3; + uint8_t padding : 3; }; static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); -struct VOP1_instruction : public Instruction { -}; +struct VOP1_instruction : public Instruction {}; static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct VOP2_instruction : public Instruction { -}; +struct VOP2_instruction : public Instruction {}; static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); -struct VOPC_instruction : public Instruction { -}; +struct VOPC_instruction : public Instruction {}; static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); struct VOP3_instruction : public Instruction { @@ -1295,39 +1364,39 @@ struct DPP_instruction : public Instruction { static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); enum sdwa_sel : uint8_t { - /* masks */ - sdwa_wordnum = 0x1, - sdwa_bytenum = 0x3, - sdwa_asuint = 0x7 | 0x10, - sdwa_rasize = 0x3, + /* masks */ + sdwa_wordnum = 0x1, + 
sdwa_bytenum = 0x3, + sdwa_asuint = 0x7 | 0x10, + sdwa_rasize = 0x3, - /* flags */ - sdwa_isword = 0x4, - sdwa_sext = 0x8, - sdwa_isra = 0x10, + /* flags */ + sdwa_isword = 0x4, + sdwa_sext = 0x8, + sdwa_isra = 0x10, - /* specific values */ - sdwa_ubyte0 = 0, - sdwa_ubyte1 = 1, - sdwa_ubyte2 = 2, - sdwa_ubyte3 = 3, - sdwa_uword0 = sdwa_isword | 0, - sdwa_uword1 = sdwa_isword | 1, - sdwa_udword = 6, + /* specific values */ + sdwa_ubyte0 = 0, + sdwa_ubyte1 = 1, + sdwa_ubyte2 = 2, + sdwa_ubyte3 = 3, + sdwa_uword0 = sdwa_isword | 0, + sdwa_uword1 = sdwa_isword | 1, + sdwa_udword = 6, - sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, - sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, - sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, - sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, - sdwa_sword0 = sdwa_uword0 | sdwa_sext, - sdwa_sword1 = sdwa_uword1 | sdwa_sext, - sdwa_sdword = sdwa_udword | sdwa_sext, + sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, + sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, + sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, + sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, + sdwa_sword0 = sdwa_uword0 | sdwa_sext, + sdwa_sword1 = sdwa_uword1 | sdwa_sext, + sdwa_sdword = sdwa_udword | sdwa_sext, - /* register-allocated */ - sdwa_ubyte = 1 | sdwa_isra, - sdwa_uword = 2 | sdwa_isra, - sdwa_sbyte = sdwa_ubyte | sdwa_sext, - sdwa_sword = sdwa_uword | sdwa_sext, + /* register-allocated */ + sdwa_ubyte = 1 | sdwa_isra, + sdwa_uword = 2 | sdwa_isra, + sdwa_sbyte = sdwa_ubyte | sdwa_sext, + sdwa_sword = sdwa_uword | sdwa_sext, }; /** @@ -1387,16 +1456,16 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected pad */ struct MUBUF_instruction : public Instruction { memory_sync_info sync; - bool offen : 1; /* Supply an offset from VGPR (VADDR) */ - bool idxen : 1; /* Supply an index from VGPR (VADDR) */ - bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ - bool glc : 1; /* globally coherent */ - bool dlc : 1; /* NAVI: device level coherent */ - bool slc : 1; /* system level coherent */ - bool tfe : 1; /* texture fail enable */ - bool lds : 1; /* Return read-data to LDS instead of VGPRs */ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + bool idxen : 1; /* Supply an index from VGPR (VADDR) */ + bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ + bool glc : 1; /* globally coherent */ + bool dlc : 1; /* NAVI: device level coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool lds : 1; /* Return read-data to LDS instead of VGPRs */ uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ - uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ uint16_t swizzled : 1; uint16_t padding0 : 2; uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ @@ -1414,14 +1483,14 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected */ struct MTBUF_instruction : public Instruction { memory_sync_info sync; - uint8_t dfmt : 4; /* Data Format of data in memory buffer */ - uint8_t nfmt : 3; /* Numeric format of data in memory */ - bool offen : 1; /* Supply an offset from VGPR (VADDR) */ - uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */ - uint16_t glc : 1; /* globally coherent */ - uint16_t dlc : 1; /* NAVI: device level coherent */ - uint16_t slc : 1; /* system level coherent */ - uint16_t tfe : 1; /* texture fail enable */ + uint8_t dfmt : 4; /* Data Format of data in memory buffer */ + uint8_t nfmt : 3; /* Numeric format of data in memory 
*/ + bool offen : 1; /* Supply an offset from VGPR (VADDR) */ + uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */ + uint16_t glc : 1; /* globally coherent */ + uint16_t dlc : 1; /* NAVI: device level coherent */ + uint16_t slc : 1; /* system level coherent */ + uint16_t tfe : 1; /* texture fail enable */ uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */ uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */ uint16_t padding : 4; @@ -1440,18 +1509,18 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected */ struct MIMG_instruction : public Instruction { memory_sync_info sync; - uint8_t dmask; /* Data VGPR enable mask */ - uint8_t dim : 3; /* NAVI: dimensionality */ - bool unrm : 1; /* Force address to be un-normalized */ - bool dlc : 1; /* NAVI: device level coherent */ - bool glc : 1; /* globally coherent */ - bool slc : 1; /* system level coherent */ - bool tfe : 1; /* texture fail enable */ - bool da : 1; /* declare an array */ - bool lwe : 1; /* LOD warning enable */ - bool r128 : 1; /* NAVI: Texture resource size */ - bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ - bool d16 : 1; /* Convert 32-bit data to 16-bit data */ + uint8_t dmask; /* Data VGPR enable mask */ + uint8_t dim : 3; /* NAVI: dimensionality */ + bool unrm : 1; /* Force address to be un-normalized */ + bool dlc : 1; /* NAVI: device level coherent */ + bool glc : 1; /* globally coherent */ + bool slc : 1; /* system level coherent */ + bool tfe : 1; /* texture fail enable */ + bool da : 1; /* declare an array */ + bool lwe : 1; /* LOD warning enable */ + bool r128 : 1; /* NAVI: Texture resource size */ + bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ + bool d16 : 1; /* Convert 32-bit data to 16-bit data */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ uint8_t padding0 : 2; uint8_t padding1; @@ -1514,6 +1583,7 @@ struct Pseudo_barrier_instruction : public Instruction { static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); enum ReduceOp : uint16_t { + // clang-format off iadd8, iadd16, iadd32, iadd64, imul8, imul16, imul32, imul64, fadd16, fadd32, fadd64, @@ -1528,6 +1598,7 @@ enum ReduceOp : uint16_t { ior8, ior16, ior32, ior64, ixor8, ixor16, ixor32, ixor64, num_reduce_ops, + // clang-format on }; /** @@ -1547,23 +1618,24 @@ struct Pseudo_reduction_instruction : public Instruction { ReduceOp reduce_op; uint16_t cluster_size; // must be 0 for scans }; -static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, + "Unexpected padding"); struct instr_deleter_functor { - void operator()(void* p) { - free(p); - } + void operator()(void* p) { free(p); } }; -template -using aco_ptr = std::unique_ptr; +template using aco_ptr = std::unique_ptr; -template -T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions) +template +T* +create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, + uint32_t num_definitions) { - std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); - char *data = (char*) calloc(1, size); - T* inst = (T*) data; + std::size_t size = + sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); + char* data = (char*)calloc(1, size); + T* inst = (T*)data; inst->opcode = 
opcode; inst->format = format; @@ -1576,7 +1648,8 @@ T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, u return inst; } -constexpr bool Instruction::usesModifiers() const noexcept +constexpr bool +Instruction::usesModifiers() const noexcept { if (isDPP() || isSDWA()) return true; @@ -1603,19 +1676,21 @@ constexpr bool Instruction::usesModifiers() const noexcept return false; } -constexpr bool is_phi(Instruction* instr) +constexpr bool +is_phi(Instruction* instr) { return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; } -static inline bool is_phi(aco_ptr& instr) +static inline bool +is_phi(aco_ptr& instr) { return is_phi(instr.get()); } memory_sync_info get_sync_info(const Instruction* instr); -bool is_dead(const std::vector& uses, Instruction *instr); +bool is_dead(const std::vector& uses, Instruction* instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra); @@ -1625,9 +1700,9 @@ bool needs_exec_mask(const Instruction* instr); uint32_t get_reduction_identity(ReduceOp op, unsigned idx); -unsigned get_mimg_nsa_dwords(const Instruction *instr); +unsigned get_mimg_nsa_dwords(const Instruction* instr); -bool should_form_clause(const Instruction *a, const Instruction *b); +bool should_form_clause(const Instruction* a, const Instruction* b); enum block_kind { /* uniform indicates that leaving this block, @@ -1650,50 +1725,56 @@ enum block_kind { block_kind_export_end = 1 << 15, }; - struct RegisterDemand { constexpr RegisterDemand() = default; - constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept - : vgpr{v}, sgpr{s} {} + constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {} int16_t vgpr = 0; int16_t sgpr = 0; - constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { + constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept + { return a.vgpr == b.vgpr && a.sgpr == b.sgpr; } - constexpr bool exceeds(const RegisterDemand other) const noexcept { + constexpr bool exceeds(const RegisterDemand other) const noexcept + { return vgpr > other.vgpr || sgpr > other.sgpr; } - constexpr RegisterDemand operator+(const Temp t) const noexcept { + constexpr RegisterDemand operator+(const Temp t) const noexcept + { if (t.type() == RegType::sgpr) - return RegisterDemand( vgpr, sgpr + t.size() ); + return RegisterDemand(vgpr, sgpr + t.size()); else - return RegisterDemand( vgpr + t.size(), sgpr ); + return RegisterDemand(vgpr + t.size(), sgpr); } - constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { + constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept + { return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); } - constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { + constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept + { return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); } - constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { + constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept + { vgpr += other.vgpr; sgpr += other.sgpr; return *this; } - constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { + constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept + { vgpr -= other.vgpr; sgpr -= other.sgpr; return *this; } - constexpr RegisterDemand& 
operator+=(const Temp t) noexcept { + constexpr RegisterDemand& operator+=(const Temp t) noexcept + { if (t.type() == RegType::sgpr) sgpr += t.size(); else @@ -1701,7 +1782,8 @@ struct RegisterDemand { return *this; } - constexpr RegisterDemand& operator-=(const Temp t) noexcept { + constexpr RegisterDemand& operator-=(const Temp t) noexcept + { if (t.type() == RegType::sgpr) sgpr -= t.size(); else @@ -1709,11 +1791,11 @@ struct RegisterDemand { return *this; } - constexpr void update(const RegisterDemand other) noexcept { + constexpr void update(const RegisterDemand other) noexcept + { vgpr = std::max(vgpr, other.vgpr); sgpr = std::max(sgpr, other.sgpr); } - }; /* CFG */ @@ -1746,23 +1828,25 @@ struct Block { * Shader stages as provided in Vulkan by the application. Contrast this to HWStage. */ enum class SWStage : uint8_t { - None = 0, - VS = 1 << 0, /* Vertex Shader */ - GS = 1 << 1, /* Geometry Shader */ - TCS = 1 << 2, /* Tessellation Control aka Hull Shader */ - TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */ - FS = 1 << 4, /* Fragment aka Pixel Shader */ - CS = 1 << 5, /* Compute Shader */ - GSCopy = 1 << 6, /* GS Copy Shader (internal) */ + None = 0, + VS = 1 << 0, /* Vertex Shader */ + GS = 1 << 1, /* Geometry Shader */ + TCS = 1 << 2, /* Tessellation Control aka Hull Shader */ + TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */ + FS = 1 << 4, /* Fragment aka Pixel Shader */ + CS = 1 << 5, /* Compute Shader */ + GSCopy = 1 << 6, /* GS Copy Shader (internal) */ - /* Stage combinations merged to run on a single HWStage */ - VS_GS = VS | GS, - VS_TCS = VS | TCS, - TES_GS = TES | GS, + /* Stage combinations merged to run on a single HWStage */ + VS_GS = VS | GS, + VS_TCS = VS | TCS, + TES_GS = TES | GS, }; -constexpr SWStage operator|(SWStage a, SWStage b) { - return static_cast(static_cast(a) | static_cast(b)); +constexpr SWStage +operator|(SWStage a, SWStage b) +{ + return static_cast(static_cast(a) | static_cast(b)); } /* @@ -1773,14 +1857,14 @@ constexpr SWStage operator|(SWStage a, SWStage b) { * See README.md for details. */ enum class HWStage : uint8_t { - VS, - ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ - GS, /* Geometry shader on GFX10/legacy and GFX6-9. */ - NGG, /* Primitive shader, used to implement VS, TES, GS. */ - LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ - HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ - FS, - CS, + VS, + ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */ + GS, /* Geometry shader on GFX10/legacy and GFX6-9. */ + NGG, /* Primitive shader, used to implement VS, TES, GS. */ + LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */ + HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */ + FS, + CS, }; /* @@ -1788,32 +1872,27 @@ enum class HWStage : uint8_t { * HWStage it will run on. 
*/ struct Stage { - constexpr Stage() = default; + constexpr Stage() = default; - explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { } + explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {} - /* Check if the given SWStage is included */ - constexpr bool has(SWStage stage) const { - return (static_cast(sw) & static_cast(stage)); - } + /* Check if the given SWStage is included */ + constexpr bool has(SWStage stage) const + { + return (static_cast(sw) & static_cast(stage)); + } - unsigned num_sw_stages() const { - return util_bitcount(static_cast(sw)); - } + unsigned num_sw_stages() const { return util_bitcount(static_cast(sw)); } - constexpr bool operator==(const Stage& other) const { - return sw == other.sw && hw == other.hw; - } + constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; } - constexpr bool operator!=(const Stage& other) const { - return sw != other.sw || hw != other.hw; - } + constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; } - /* Mask of merged software stages */ - SWStage sw = SWStage::None; + /* Mask of merged software stages */ + SWStage sw = SWStage::None; - /* Active hardware stage */ - HWStage hw {}; + /* Active hardware stage */ + HWStage hw{}; }; /* possible settings of Program::stage */ @@ -1835,7 +1914,8 @@ static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS); static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */ static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */ static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS); -static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */ +static constexpr Stage tess_eval_es(HWStage::ES, + SWStage::TES); /* tesselation evaluation before geometry */ static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS); enum statistic { @@ -1884,7 +1964,7 @@ public: uint16_t num_waves = 0; uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */ ac_shader_config* config; - struct radv_shader_info *info; + struct radv_shader_info* info; enum chip_class chip_class; enum radeon_family family; DeviceInfo dev; @@ -1892,7 +1972,7 @@ public: RegClass lane_mask; Stage stage; bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ - bool needs_wqm = false; /* there exists a p_wqm instruction */ + bool needs_wqm = false; /* there exists a p_wqm instruction */ std::vector constant_data; Temp private_segment_buffer; @@ -1917,12 +1997,10 @@ public: unsigned next_uniform_if_depth = 0; struct { - FILE *output = stderr; + FILE* output = stderr; bool shorten_messages = false; - void (*func)(void *private_data, - enum radv_compiler_debug_level level, - const char *message); - void *private_data; + void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message); + void* private_data; } debug; uint32_t allocateId(RegClass rc) @@ -1939,25 +2017,21 @@ public: allocationID += amount; } - Temp allocateTmp(RegClass rc) - { - return Temp(allocateId(rc), rc); - } + Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); } - uint32_t peekAllocationId() - { - return allocationID; - } + uint32_t peekAllocationId() { return allocationID; } friend void reindex_ssa(Program* program); friend void reindex_ssa(Program* program, std::vector& live_out); - Block* create_and_insert_block() { + 
Block* create_and_insert_block() + { Block block; return insert_block(std::move(block)); } - Block* insert_block(Block&& block) { + Block* insert_block(Block&& block) + { block.index = blocks.size(); block.fp_mode = next_fp_mode; block.loop_nest_depth = next_loop_depth; @@ -1985,35 +2059,30 @@ struct ra_test_policy { void init(); -void init_program(Program *program, Stage stage, struct radv_shader_info *info, - enum chip_class chip_class, enum radeon_family family, - bool wgp_mode, ac_shader_config *config); +void init_program(Program* program, Stage stage, struct radv_shader_info* info, + enum chip_class chip_class, enum radeon_family family, bool wgp_mode, + ac_shader_config* config); -void select_program(Program *program, - unsigned shader_count, - struct nir_shader *const *shaders, - ac_shader_config* config, - struct radv_shader_args *args); -void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, - ac_shader_config* config, - struct radv_shader_args *args); -void select_trap_handler_shader(Program *program, struct nir_shader *shader, - ac_shader_config* config, - struct radv_shader_args *args); +void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, + ac_shader_config* config, struct radv_shader_args* args); +void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, + struct radv_shader_args* args); +void select_trap_handler_shader(Program* program, struct nir_shader* shader, + ac_shader_config* config, struct radv_shader_args* args); void lower_phis(Program* program); void calc_min_waves(Program* program); void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); live live_var_analysis(Program* program); -std::vector dead_code_analysis(Program *program); +std::vector dead_code_analysis(Program* program); void dominator_tree(Program* program); -void insert_exec_mask(Program *program); +void insert_exec_mask(Program* program); void value_numbering(Program* program); void optimize(Program* program); void optimize_postRA(Program* program); void setup_reduce_temp(Program* program); void lower_to_cssa(Program* program, live& live_vars); -void register_allocation(Program *program, std::vector& live_out_per_block, +void register_allocation(Program* program, std::vector& live_out_per_block, ra_test_policy = {}); void ssa_elimination(Program* program); void lower_to_hw_instr(Program* program); @@ -2021,21 +2090,22 @@ void schedule_program(Program* program, live& live_vars); void spill(Program* program, live& live_vars); void insert_wait_states(Program* program); void insert_NOPs(Program* program); -void form_hard_clauses(Program *program); +void form_hard_clauses(Program* program); unsigned emit_program(Program* program, std::vector& code); -bool print_asm(Program *program, std::vector& binary, - unsigned exec_size, FILE *output); +bool print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output); bool validate_ir(Program* program); bool validate_ra(Program* program); #ifndef NDEBUG -void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL); +void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL); #else -#define perfwarn(program, cond, msg, ...) do {} while(0) +#define perfwarn(program, cond, msg, ...) 
\ + do { \ + } while (0) #endif -void collect_presched_stats(Program *program); -void collect_preasm_stats(Program *program); -void collect_postasm_stats(Program *program, const std::vector& code); +void collect_presched_stats(Program* program); +void collect_preasm_stats(Program* program); +void collect_postasm_stats(Program* program, const std::vector& code); enum print_flags { print_no_ssa = 0x1, @@ -2044,34 +2114,34 @@ enum print_flags { print_live_vars = 0x8, }; -void aco_print_operand(const Operand *operand, FILE *output, unsigned flags=0); -void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags=0); -void aco_print_program(const Program *program, FILE *output, unsigned flags=0); -void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags=0); +void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0); +void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0); +void aco_print_program(const Program* program, FILE* output, unsigned flags = 0); +void aco_print_program(const Program* program, FILE* output, const live& live_vars, + unsigned flags = 0); -void _aco_perfwarn(Program *program, const char *file, unsigned line, - const char *fmt, ...); -void _aco_err(Program *program, const char *file, unsigned line, - const char *fmt, ...); +void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...); +void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...); #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__) -#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__) +#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__) /* utilities for dealing with register demand */ RegisterDemand get_live_changes(aco_ptr& instr); RegisterDemand get_temp_registers(aco_ptr& instr); -RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, aco_ptr& instr_before); +RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, + aco_ptr& instr_before); /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ -uint16_t get_extra_sgprs(Program *program); +uint16_t get_extra_sgprs(Program* program); /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */ -uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); -uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs); +uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs); +uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs); /* return number of addressable sgprs/vgprs for max_waves */ -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves); +uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves); +uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves); typedef struct { const int16_t opcode_gfx7[static_cast(aco_opcode::num_opcodes)]; @@ -2080,7 +2150,7 @@ typedef struct { const std::bitset(aco_opcode::num_opcodes)> can_use_input_modifiers; const std::bitset(aco_opcode::num_opcodes)> can_use_output_modifiers; const std::bitset(aco_opcode::num_opcodes)> is_atomic; - const char *name[static_cast(aco_opcode::num_opcodes)]; + const char* name[static_cast(aco_opcode::num_opcodes)]; const aco::Format format[static_cast(aco_opcode::num_opcodes)]; /* 
sizes used for input/output modifiers and constants */ const unsigned operand_size[static_cast(aco_opcode::num_opcodes)]; @@ -2090,7 +2160,6 @@ typedef struct { extern const Info instr_info; -} +} // namespace aco #endif /* ACO_IR_H */ - diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index bcba44acd19..60876267d04 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -24,13 +24,15 @@ */ #include "aco_ir.h" + #include "util/u_math.h" #include #include namespace aco { -RegisterDemand get_live_changes(aco_ptr& instr) +RegisterDemand +get_live_changes(aco_ptr& instr) { RegisterDemand changes; for (const Definition& def : instr->definitions) { @@ -48,7 +50,8 @@ RegisterDemand get_live_changes(aco_ptr& instr) return changes; } -RegisterDemand get_temp_registers(aco_ptr& instr) +RegisterDemand +get_temp_registers(aco_ptr& instr) { RegisterDemand temp_registers; @@ -67,7 +70,9 @@ RegisterDemand get_temp_registers(aco_ptr& instr) return temp_registers; } -RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& instr, aco_ptr& instr_before) +RegisterDemand +get_demand_before(RegisterDemand demand, aco_ptr& instr, + aco_ptr& instr_before) { demand -= get_live_changes(instr); demand -= get_temp_registers(instr); @@ -77,8 +82,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr& in } namespace { -void process_live_temps_per_block(Program *program, live& lives, Block* block, - std::set& worklist, std::vector& phi_sgpr_ops) +void +process_live_temps_per_block(Program* program, live& lives, Block* block, + std::set& worklist, std::vector& phi_sgpr_ops) { std::vector& register_demand = lives.register_demand[block->index]; RegisterDemand new_demand; @@ -94,8 +100,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, /* traverse the instructions backwards */ int idx; - for (idx = block->instructions.size() -1; idx >= 0; idx--) { - Instruction *insn = block->instructions[idx].get(); + for (idx = block->instructions.size() - 1; idx >= 0; idx--) { + Instruction* insn = block->instructions[idx].get(); if (is_phi(insn)) break; @@ -131,8 +137,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, for (Operand& op : insn->operands) op.setKill(false); - for (unsigned i = 0; i < insn->operands.size(); ++i) - { + for (unsigned i = 0; i < insn->operands.size(); ++i) { Operand& operand = insn->operands[i]; if (!operand.isTemp()) continue; @@ -143,7 +148,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if (inserted) { operand.setFirstKill(true); for (unsigned j = i + 1; j < insn->operands.size(); ++j) { - if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) { + if (insn->operands[j].isTemp() && + insn->operands[j].tempId() == operand.tempId()) { insn->operands[j].setFirstKill(false); insn->operands[j].setKill(true); } @@ -167,7 +173,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, int phi_idx = idx; while (phi_idx >= 0) { register_demand[phi_idx] = new_demand; - Instruction *insn = block->instructions[phi_idx].get(); + Instruction* insn = block->instructions[phi_idx].get(); assert(is_phi(insn) && insn->definitions.size() == 1); if (!insn->definitions[0].isTemp()) { @@ -196,7 +202,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, #ifndef NDEBUG if (preds.empty()) - aco_err(program, "Temporary never 
defined or are defined after use: %%%d in BB%d", t, block->index); + aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, + block->index); #endif for (unsigned pred_idx : preds) { @@ -209,14 +216,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, /* handle phi operands */ phi_idx = idx; while (phi_idx >= 0) { - Instruction *insn = block->instructions[phi_idx].get(); + Instruction* insn = block->instructions[phi_idx].get(); assert(is_phi(insn)); /* directly insert into the predecessors live-out set */ - std::vector& preds = insn->opcode == aco_opcode::p_phi - ? block->logical_preds - : block->linear_preds; + std::vector& preds = + insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; for (unsigned i = 0; i < preds.size(); ++i) { - Operand &operand = insn->operands[i]; + Operand& operand = insn->operands[i]; if (!operand.isTemp()) continue; if (operand.isFixed() && operand.physReg() == vcc) @@ -238,18 +244,19 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty())); } -unsigned calc_waves_per_workgroup(Program *program) +unsigned +calc_waves_per_workgroup(Program* program) { /* When workgroup size is not known, just go with wave_size */ - unsigned workgroup_size = program->workgroup_size == UINT_MAX - ? program->wave_size - : program->workgroup_size; + unsigned workgroup_size = + program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size; return align(workgroup_size, program->wave_size) / program->wave_size; } } /* end namespace */ -uint16_t get_extra_sgprs(Program *program) +uint16_t +get_extra_sgprs(Program* program) { if (program->chip_class >= GFX10) { assert(!program->needs_flat_scr); @@ -275,26 +282,30 @@ uint16_t get_extra_sgprs(Program *program) } } -uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs) +uint16_t +get_sgpr_alloc(Program* program, uint16_t addressable_sgprs) { uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); uint16_t granule = program->dev.sgpr_alloc_granule; return ALIGN_NPOT(std::max(sgprs, granule), granule); } -uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs) +uint16_t +get_vgpr_alloc(Program* program, uint16_t addressable_vgprs) { assert(addressable_vgprs <= program->dev.vgpr_limit); uint16_t granule = program->dev.vgpr_alloc_granule; return align(std::max(addressable_vgprs, granule), granule); } -unsigned round_down(unsigned a, unsigned b) +unsigned +round_down(unsigned a, unsigned b) { return a - (a % b); } -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) +uint16_t +get_addr_sgpr_from_waves(Program* program, uint16_t waves) { /* it's not possible to allocate more than 128 SGPRs */ uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128); @@ -303,21 +314,24 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) return std::min(sgprs, program->dev.sgpr_limit); } -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves) +uint16_t +get_addr_vgpr_from_waves(Program* program, uint16_t waves) { uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1); vgprs -= program->config->num_shared_vgprs / 2; return std::min(vgprs, program->dev.vgpr_limit); } -void calc_min_waves(Program* program) +void +calc_min_waves(Program* program) { unsigned waves_per_workgroup = calc_waves_per_workgroup(program); unsigned 
simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); } -void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) +void +update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size); unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); @@ -333,8 +347,10 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) program->max_reg_demand = new_demand; } else { program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); - uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; - program->num_waves = std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); + uint16_t vgpr_demand = + get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; + program->num_waves = + std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); program->max_waves = max_waves_per_simd; /* adjust max_waves for workgroup and LDS limits */ @@ -346,12 +362,15 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds); } if (waves_per_workgroup > 1 && program->chip_class < GFX10) - workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ + workgroups_per_cu_wgp = std::min( + workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ /* in cases like waves_per_workgroup=3 or lds=65536 and * waves_per_workgroup=1, we want the maximum possible number of waves per * SIMD and not the minimum. 
so DIV_ROUND_UP is used */ - program->max_waves = std::min(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); + program->max_waves = std::min( + program->max_waves, + DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); /* incorporate max_waves and calculate max_reg_demand */ program->num_waves = std::min(program->num_waves, program->max_waves); @@ -360,7 +379,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) } } -live live_var_analysis(Program* program) +live +live_var_analysis(Program* program) { live result; result.live_out.resize(program->blocks.size()); @@ -371,14 +391,16 @@ live live_var_analysis(Program* program) program->needs_vcc = false; - /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */ + /* this implementation assumes that the block idx corresponds to the block's position in + * program->blocks vector */ for (Block& block : program->blocks) worklist.insert(block.index); while (!worklist.empty()) { std::set::reverse_iterator b_it = worklist.rbegin(); unsigned block_idx = *b_it; worklist.erase(block_idx); - process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops); + process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, + phi_sgpr_ops); new_demand.update(program->blocks[block_idx].register_demand); } @@ -389,5 +411,4 @@ live live_var_analysis(Program* program) return result; } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp index 41d0c202eae..2b10318c9bd 100644 --- a/src/amd/compiler/aco_lower_phis.cpp +++ b/src/amd/compiler/aco_lower_phis.cpp @@ -47,7 +47,8 @@ struct ssa_state { std::vector visited; }; -Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write) +Operand +get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write) { if (!before_write) { auto it = state->writes.find(block_idx); @@ -79,7 +80,8 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef Temp res = Temp(program->allocateTmp(program->lane_mask)); state->latest[block_idx] = Operand(res); - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; + aco_ptr phi{ + create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; for (unsigned i = 0; i < pred; i++) phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false); phi->definitions[0] = Definition(res); @@ -89,11 +91,11 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef } } -void insert_before_logical_end(Block *block, aco_ptr instr) +void +insert_before_logical_end(Block* block, aco_ptr instr) { - auto IsLogicalEnd = [] (const aco_ptr& inst) -> bool { - return inst->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& inst) -> bool + { return inst->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd); if (it == block->instructions.crend()) { @@ -104,13 +106,13 @@ void insert_before_logical_end(Block *block, aco_ptr instr) } } -void build_merge_code(Program *program, Block *block, Definition dst, Operand prev, Operand cur) +void +build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur) { Builder bld(program); - auto IsLogicalEnd = [] (const 
aco_ptr& instr) -> bool { - return instr->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& instr) -> bool + { return instr->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd); assert(it != block->instructions.rend()); bld.reset(&block->instructions, std::prev(it.base())); @@ -126,7 +128,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr if (!prev_is_constant) { if (!cur_is_constant) { Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); - bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, Operand(exec, bld.lm)); + bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, + Operand(exec, bld.lm)); bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm)); bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2); } else if (cur.constantValue()) { @@ -151,7 +154,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr } } -void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco_ptr& phi) +void +init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr& phi) { std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false); for (unsigned i = 0; i < block->logical_preds.size(); i++) { @@ -178,7 +182,9 @@ void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco } } -void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr& phi) +void +lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + aco_ptr& phi) { Builder bld(program); @@ -186,7 +192,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, state->all_preds_uniform = !(block->kind & block_kind_merge) && block->linear_preds.size() == block->logical_preds.size(); for (unsigned pred : block->logical_preds) - state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform); + state->all_preds_uniform = + state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform); state->checked_preds_for_uniform = true; } @@ -230,7 +237,7 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, bool uniform_merge = block->kind & block_kind_loop_header; for (unsigned i = 0; i < phi->operands.size(); i++) { - Block *pred = &program->blocks[block->logical_preds[i]]; + Block* pred = &program->blocks[block->logical_preds[i]]; bool need_get_ssa = !uniform_merge; if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform)) @@ -254,7 +261,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, unsigned num_preds = block->linear_preds.size(); if (phi->operands.size() != num_preds) { - Pseudo_instruction* new_phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)}; + Pseudo_instruction* new_phi{create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)}; new_phi->definitions[0] = phi->definitions[0]; phi.reset(new_phi); } else { @@ -268,7 +276,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, return; } -void lower_subdword_phis(Program *program, Block *block, aco_ptr& phi) +void +lower_subdword_phis(Program* program, Block* block, aco_ptr& phi) { Builder bld(program); for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -278,21 +287,24 @@ void 
lower_subdword_phis(Program *program, Block *block, aco_ptr& p continue; assert(phi->operands[i].isTemp()); - Block *pred = &program->blocks[block->logical_preds[i]]; + Block* pred = &program->blocks[block->logical_preds[i]]; Temp phi_src = phi->operands[i].getTemp(); assert(phi_src.regClass().type() == RegType::sgpr); Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size())); insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr()); Temp new_phi_src = bld.tmp(phi->definitions[0].regClass()); - insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr()); + insert_before_logical_end( + pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)) + .get_ptr()); phi->operands[i].setTemp(new_phi_src); } return; } -void lower_phis(Program* program) +void +lower_phis(Program* program) { ssa_state state; @@ -301,7 +313,8 @@ void lower_phis(Program* program) state.needs_init = true; for (aco_ptr& phi : block.instructions) { if (phi->opcode == aco_opcode::p_phi) { - assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2); + assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 + : phi->definitions[0].regClass() != s2); if (phi->definitions[0].regClass() == program->lane_mask) lower_divergent_bool_phi(program, &state, &block, phi); else if (phi->definitions[0].regClass().is_subdword()) @@ -313,4 +326,4 @@ void lower_phis(Program* program) } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp index 15fd0f6ae62..db809867a70 100644 --- a/src/amd/compiler/aco_lower_to_cssa.cpp +++ b/src/amd/compiler/aco_lower_to_cssa.cpp @@ -53,32 +53,32 @@ struct copy { struct merge_node { Operand value = Operand(); /* original value: can be an SSA-def or constant value */ - uint32_t index = -1u; /* index into the vector of merge sets */ + uint32_t index = -1u; /* index into the vector of merge sets */ uint32_t defined_at = -1u; /* defining block */ /* we also remember two dominating defs with the same value: */ - Temp equal_anc_in = Temp(); /* within the same merge set */ + Temp equal_anc_in = Temp(); /* within the same merge set */ Temp equal_anc_out = Temp(); /* from a different set */ }; struct cssa_ctx { Program* program; - std::vector& live_out; /* live-out sets per block */ + std::vector& live_out; /* live-out sets per block */ std::vector> parallelcopies; /* copies per block */ - std::vector merge_sets; /* each vector is one (ordered) merge set */ + std::vector merge_sets; /* each vector is one (ordered) merge set */ std::unordered_map merge_node_table; /* tempid -> merge node */ }; /* create (virtual) parallelcopies for each phi instruction and * already merge copy-definitions with phi-defs into merge sets */ -void collect_parallelcopies(cssa_ctx& ctx) +void +collect_parallelcopies(cssa_ctx& ctx) { ctx.parallelcopies.resize(ctx.program->blocks.size()); Builder bld(ctx.program); for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { - if (phi->opcode != aco_opcode::p_phi && - phi->opcode != aco_opcode::p_linear_phi) + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; const Definition& def = phi->definitions[0]; @@ -89,9 +89,8 @@ void collect_parallelcopies(cssa_ctx& ctx) if (!def.isTemp()) continue; - std::vector& preds = phi->opcode == aco_opcode::p_phi ? 
- block.logical_preds : - block.linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; uint32_t index = ctx.merge_sets.size(); merge_set set; @@ -151,8 +150,8 @@ void collect_parallelcopies(cssa_ctx& ctx) } /* check whether the definition of a comes after b. */ -inline -bool defined_after(cssa_ctx& ctx, Temp a, Temp b) +inline bool +defined_after(cssa_ctx& ctx, Temp a, Temp b) { merge_node& node_a = ctx.merge_node_table[a.id()]; merge_node& node_b = ctx.merge_node_table[b.id()]; @@ -163,25 +162,24 @@ bool defined_after(cssa_ctx& ctx, Temp a, Temp b) } /* check whether a dominates b where b is defined after a */ -inline -bool dominates(cssa_ctx& ctx, Temp a, Temp b) +inline bool +dominates(cssa_ctx& ctx, Temp a, Temp b) { assert(defined_after(ctx, b, a)); merge_node& node_a = ctx.merge_node_table[a.id()]; merge_node& node_b = ctx.merge_node_table[b.id()]; unsigned idom = node_b.defined_at; while (idom > node_a.defined_at) - idom = b.regClass().type() == RegType::vgpr ? - ctx.program->blocks[idom].logical_idom : - ctx.program->blocks[idom].linear_idom; + idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom + : ctx.program->blocks[idom].linear_idom; return idom == node_a.defined_at; } /* check intersection between var and parent: * We already know that parent dominates var. */ -inline -bool intersects(cssa_ctx& ctx, Temp var, Temp parent) +inline bool +intersects(cssa_ctx& ctx, Temp var, Temp parent) { merge_node& node_var = ctx.merge_node_table[var.id()]; merge_node& node_parent = ctx.merge_node_table[parent.id()]; @@ -196,9 +194,9 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent) /* parent is defined in a different block than var */ if (node_parent.defined_at < node_var.defined_at) { /* if the parent is not live-in, they don't interfere */ - std::vector& preds = var.type() == RegType::vgpr ? - ctx.program->blocks[block_idx].logical_preds : - ctx.program->blocks[block_idx].linear_preds; + std::vector& preds = var.type() == RegType::vgpr + ? ctx.program->blocks[block_idx].logical_preds + : ctx.program->blocks[block_idx].linear_preds; for (uint32_t pred : preds) { if (!ctx.live_out[pred].count(parent.id())) return false; @@ -246,8 +244,8 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent) /* check interference between var and parent: * i.e. they have different values and intersect. * If parent and var share the same value, also updates the equal ancestor. 
*/ -inline -bool interference(cssa_ctx& ctx, Temp var, Temp parent) +inline bool +interference(cssa_ctx& ctx, Temp var, Temp parent) { assert(var != parent); merge_node& node_var = ctx.merge_node_table[var.id()]; @@ -281,13 +279,14 @@ bool interference(cssa_ctx& ctx, Temp var, Temp parent) /* tries to merge set_b into set_a of given temporary and * drops that temporary as it is being coalesced */ -bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) +bool +try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) { auto def_node_it = ctx.merge_node_table.find(dst.id()); uint32_t index = def_node_it->second.index; merge_set& set_a = ctx.merge_sets[index]; std::vector dom; /* stack of the traversal */ - merge_set union_set; /* the new merged merge-set */ + merge_set union_set; /* the new merged merge-set */ uint32_t i_a = 0; uint32_t i_b = 0; @@ -335,7 +334,8 @@ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) } /* returns true if the copy can safely be omitted */ -bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) +bool +try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) { /* we can only coalesce temporaries */ if (!copy.op.isTemp()) @@ -348,11 +348,9 @@ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) uint32_t pred = block_idx; do { block_idx = pred; - pred = copy.op.regClass().type() == RegType::vgpr ? - ctx.program->blocks[pred].logical_idom : - ctx.program->blocks[pred].linear_idom; - } while (block_idx != pred && - ctx.live_out[pred].count(copy.op.tempId())); + pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom + : ctx.program->blocks[pred].linear_idom; + } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId())); op_node.defined_at = block_idx; op_node.value = copy.op; } @@ -385,7 +383,8 @@ struct ltg_node { /* emit the copies in an order that does not * create interferences within a merge-set */ -void emit_copies_block(Builder bld, std::map& ltg, RegType type) +void +emit_copies_block(Builder bld, std::map& ltg, RegType type) { auto&& it = ltg.begin(); while (it != ltg.end()) { @@ -410,16 +409,16 @@ void emit_copies_block(Builder bld, std::map& ltg, RegType t } /* count the number of remaining circular dependencies */ - unsigned num = std::count_if(ltg.begin(), ltg.end(), [&] (auto& n){ - return n.second.cp.def.regClass().type() == type; - }); + unsigned num = std::count_if(ltg.begin(), ltg.end(), + [&](auto& n) { return n.second.cp.def.regClass().type() == type; }); /* if there are circular dependencies, we just emit them as single parallelcopy */ if (num) { // TODO: this should be restricted to a feasible number of registers // and otherwise use a temporary to avoid having to reload more (spilled) // variables than we have registers. - aco_ptr copy{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; + aco_ptr copy{create_instruction( + aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; it = ltg.begin(); for (unsigned i = 0; i < num; i++) { while (it->second.cp.def.regClass().type() != type) @@ -435,7 +434,8 @@ void emit_copies_block(Builder bld, std::map& ltg, RegType t /* either emits or coalesces all parallelcopies and * renames the phi-operands accordingly. 
*/ -void emit_parallelcopies(cssa_ctx& ctx) +void +emit_parallelcopies(cssa_ctx& ctx) { std::unordered_map renames; @@ -476,9 +476,8 @@ void emit_parallelcopies(cssa_ctx& ctx) Block& block = ctx.program->blocks[i]; /* emit VGPR copies */ - auto IsLogicalEnd = [] (const aco_ptr& inst) -> bool { - return inst->opcode == aco_opcode::p_logical_end; - }; + auto IsLogicalEnd = [](const aco_ptr& inst) -> bool + { return inst->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); bld.reset(&block.instructions, std::prev(it.base())); emit_copies_block(bld, ltg, RegType::vgpr); @@ -494,8 +493,7 @@ void emit_parallelcopies(cssa_ctx& ctx) /* finally, rename coalesced phi operands */ for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { - if (phi->opcode != aco_opcode::p_phi && - phi->opcode != aco_opcode::p_linear_phi) + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; for (Operand& op : phi->operands) { @@ -514,8 +512,8 @@ void emit_parallelcopies(cssa_ctx& ctx) } /* end namespace */ - -void lower_to_cssa(Program* program, live& live_vars) +void +lower_to_cssa(Program* program, live& live_vars) { reindex_ssa(program, live_vars.live_out); cssa_ctx ctx = {program, live_vars.live_out}; @@ -525,5 +523,4 @@ void lower_to_cssa(Program* program, live& live_vars) /* update live variable information */ live_vars = live_var_analysis(program); } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 59cae568d68..4a1a2caf82d 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -24,6 +24,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "common/sid.h" #include @@ -32,43 +33,43 @@ namespace aco { struct lower_context { - Program *program; - Block *block; + Program* program; + Block* block; std::vector> instructions; }; /* used by handle_operands() indirectly through Builder::copy */ uint8_t int8_mul_table[512] = { - 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1, 11, - 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21, - 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31, - 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, 1, 40, 1, 41, - 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51, - 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61, - 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, 2, 35, 11, 53, - 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, 2, 40, 3, 27, - 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, 2, 45, 7, 13, - 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, 2, 50, 7, 51, - 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, 2, 55, 3, 37, - 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, 2, 60, 9, 241, - 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, 5, 26, 9, 43, - 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, 4, 35, 3, 47, - 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, 3, 50, 7, 241, - 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, 4, 40, 7, 23, - 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, 5, 34, 3, 57, - 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, 3, 60, 5, 241, - 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, 5, 38, 5, 243, - 3, 64, 7, 247, 9, 50, 5, 39, 4, 
241, 33, 37, 6, 33, 13, 35, 4, 242, 5, 245, - 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, 5, 42, 3, 241, - 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, 3, 244, - 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247, - 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, - 9, 55, 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, - 1, 248, 1, 249, 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255 -}; + 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, + 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, + 1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, + 1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, + 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, + 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, + 1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, + 2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, + 2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, + 2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, + 2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, + 2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, + 2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, + 5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, + 4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, + 3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, + 4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, + 5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, + 3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, + 5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35, + 4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, + 5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, + 3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247, + 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55, + 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249, + 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255}; - -aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { +aco_opcode +get_reduce_opcode(chip_class chip, ReduceOp op) +{ /* Because some 16-bit instructions are already VOP3 on GFX10, we use the * 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use * DPP with the arithmetic instructions. This requires to sign-extend. @@ -174,7 +175,8 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { } } -bool is_vop3_reduce_opcode(aco_opcode opcode) +bool +is_vop3_reduce_opcode(aco_opcode opcode) { /* 64-bit reductions are VOP3. 
*/ if (opcode == aco_opcode::num_opcodes) @@ -183,83 +185,75 @@ bool is_vop3_reduce_opcode(aco_opcode opcode) return instr_info.format[(int)opcode] == Format::VOP3; } -void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) +void +emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) { - Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); + Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); if (instr->definitions.size() >= 2) { assert(instr->definitions[1].regClass() == bld.lm); instr->definitions[1].setFixed(vcc); } } -void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp_reg, ReduceOp op, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, - Operand *identity=NULL) +void +emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, + PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask, + unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL) { Builder bld(ctx->program, &ctx->instructions); - Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; - Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)}; - Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)}; - Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)}; + Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)}; + Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)}; Operand src1_64 = Operand(src1_reg, v2); - Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)}; + Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)}; Operand vtmp_op64 = Operand(vtmp_reg, v2); if (op == iadd64) { if (ctx->program->chip_class >= GFX10) { if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]); } else { bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, bound_ctrl); } - bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm), - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], + Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl); } else if (op == iand64) { - bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == ior64) { - bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - 
bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == ixor64) { - bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { aco_opcode cmp = aco_opcode::num_opcodes; switch (op) { - case umin64: - cmp = aco_opcode::v_cmp_gt_u64; - break; - case umax64: - cmp = aco_opcode::v_cmp_lt_u64; - break; - case imin64: - cmp = aco_opcode::v_cmp_gt_i64; - break; - case imax64: - cmp = aco_opcode::v_cmp_lt_i64; - break; - default: - break; + case umin64: cmp = aco_opcode::v_cmp_gt_u64; break; + case umax64: cmp = aco_opcode::v_cmp_lt_u64; break; + case imin64: cmp = aco_opcode::v_cmp_gt_i64; break; + case imax64: cmp = aco_opcode::v_cmp_lt_i64; break; + default: break; } if (identity) { bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]); } - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64); bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm)); @@ -278,36 +272,38 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph */ if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]); emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]); emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]); if (identity) bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); - bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], - dpp_ctrl, row_mask, 
bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, + bound_ctrl); bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]); } } -void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op) +void +emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op) { Builder bld(ctx->program, &ctx->instructions); - Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)}; + Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)}; RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1; - Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)}; - Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)}; + Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)}; + Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)}; Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2); Operand src1_64 = Operand(src1_reg, v2); @@ -315,15 +311,15 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) { assert(vtmp.reg() != 0); bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]); src0_reg = vtmp; src0[0] = Operand(vtmp, v1); - src0[1] = Operand(PhysReg{vtmp+1}, v1); + src0[1] = Operand(PhysReg{vtmp + 1}, v1); src0_64 = Operand(vtmp, v2); } else if (src0_rc == s1 && op == iadd64) { assert(vtmp.reg() != 0); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]); - src0[1] = Operand(PhysReg{vtmp+1}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]); + src0[1] = Operand(PhysReg{vtmp + 1}, v1); } if (op == iadd64) { @@ -332,7 +328,8 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } else { bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); } - bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], + Operand(vcc, bld.lm)); } else if (op == iand64) { bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]); bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]); @@ -345,20 +342,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) { aco_opcode cmp = aco_opcode::num_opcodes; switch (op) { - case umin64: - cmp = aco_opcode::v_cmp_gt_u64; - break; - case umax64: - cmp = aco_opcode::v_cmp_lt_u64; - break; - case imin64: - cmp = aco_opcode::v_cmp_gt_i64; - break; - case imax64: - cmp = aco_opcode::v_cmp_lt_i64; - break; - default: - break; + case umin64: cmp = aco_opcode::v_cmp_gt_u64; break; + case umax64: cmp = aco_opcode::v_cmp_lt_u64; break; + case imin64: cmp = aco_opcode::v_cmp_gt_i64; break; + case imax64: cmp = aco_opcode::v_cmp_lt_i64; break; + default: break; } bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64); @@ -381,8 +369,8 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe * res_lo = umul_lo(x_lo, y_lo) 
* assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp */ - Definition tmp0_def(PhysReg{src0_reg+1}, v1); - Definition tmp1_def(PhysReg{src1_reg+1}, v1); + Definition tmp0_def(PhysReg{src0_reg + 1}, v1); + Definition tmp1_def(PhysReg{src1_reg + 1}, v1); Operand tmp0_op = src0[1]; Operand tmp1_op = src1[1]; bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]); @@ -394,10 +382,10 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe } } -void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp, ReduceOp op, unsigned size, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl, - Operand *identity=NULL) /* for VOP3 with sparse writes */ +void +emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */ { Builder bld(ctx->program, &ctx->instructions); RegClass rc = RegClass(RegType::vgpr, size); @@ -410,32 +398,34 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg if (!vop3) { if (opcode == aco_opcode::v_add_co_u32) - bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, + bound_ctrl); else bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); return; } if (opcode == aco_opcode::num_opcodes) { - emit_int64_dpp_op(ctx, dst_reg ,src0_reg, src1_reg, vtmp, op, - dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity); + emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask, + bound_ctrl, identity); return; } if (identity) bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]); if (identity && size >= 2) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]); for (unsigned i = 0; i < size; i++) - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1), - dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl); bld.vop3(opcode, dst, Operand(vtmp, rc), src1); } -void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, - PhysReg vtmp, ReduceOp op, unsigned size) +void +emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, + ReduceOp op, unsigned size) { Builder bld(ctx->program, &ctx->instructions); RegClass rc = RegClass(RegType::vgpr, size); @@ -460,26 +450,29 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1 } } -void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size, - unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl) +void +emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl, + unsigned row_mask, unsigned bank_mask, bool bound_ctrl) { Builder bld(ctx->program, &ctx->instructions); for (unsigned i = 0; i < size; i++) { - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1), - dpp_ctrl, row_mask, 
bank_mask, bound_ctrl); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1), + Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl); } } -void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern) +void +emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern) { for (unsigned i = 0; i < size; i++) { - bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1), - Operand(PhysReg{src+i}, v1), ds_pattern); + bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1), + Operand(PhysReg{src + i}, v1), ds_pattern); } } -void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp, - PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) +void +emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, + PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) { assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce); assert(cluster_size <= ctx->program->wave_size); @@ -492,20 +485,22 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig Operand vcndmask_identity[2] = {identity[0], identity[1]}; /* First, copy the source to tmp and set inactive lanes to the identity */ - bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); + bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), + Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); for (unsigned i = 0; i < src.size(); i++) { /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 * except on GFX10, where v_writelane_b32 can take a literal. 
*/ - if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) { - bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]); - identity[i] = Operand(PhysReg{sitmp+i}, s1); + if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && + ctx->program->chip_class < GFX10) { + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]); + identity[i] = Operand(PhysReg{sitmp + i}, s1); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); - vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1); } else if (identity[i].isLiteral()) { - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); - vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1); } } @@ -517,7 +512,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (src.regClass() == v1b) { if (ctx->program->chip_class >= GFX8) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); if (reduce_op == imin8 || reduce_op == imax8) @@ -534,14 +530,15 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig else opcode = aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(PhysReg{tmp}, v1), - Operand(PhysReg{tmp}, v1), Operand(0u), Operand(8u)); + bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand(0u), + Operand(8u)); } } else if (src.regClass() == v2b) { if (ctx->program->chip_class >= GFX10 && - (reduce_op == iadd16 || reduce_op == imax16 || - reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16)) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + (reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 || + reduce_op == umin16 || reduce_op == umax16)) { + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16) @@ -558,54 +555,69 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig else opcode = aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(PhysReg{tmp}, v1), - Operand(PhysReg{tmp}, v1), Operand(0u), Operand(16u)); + bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand(0u), + Operand(16u)); } } bool reduction_needs_last_op = false; switch (op) { case aco_opcode::p_reduce: - if (cluster_size == 1) break; + if (cluster_size == 1) + break; if (ctx->program->chip_class <= GFX7) { reduction_needs_last_op = true; emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2)); - if (cluster_size == 2) break; + if (cluster_size == 2) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1)); - if (cluster_size == 4) break; + if (cluster_size == 4) + break; emit_op(ctx, tmp, vtmp, tmp, 
PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04)); - if (cluster_size == 8) break; + if (cluster_size == 8) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08)); - if (cluster_size == 16) break; + if (cluster_size == 16) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10)); - if (cluster_size == 32) break; + if (cluster_size == 32) + break; emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u)); + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); // TODO: it would be more effective to do the last reduction step on SALU emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); reduction_needs_last_op = false; break; } - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false); - if (cluster_size == 2) break; - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false); - if (cluster_size == 4) break; - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false); - if (cluster_size == 8) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, + 0xf, false); + if (cluster_size == 2) + break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, + 0xf, false); + if (cluster_size == 4) + break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, + false); + if (cluster_size == 8) + break; emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false); - if (cluster_size == 16) break; + if (cluster_size == 16) + break; if (ctx->program->chip_class >= GFX10) { /* GFX10+ doesn't support row_bcast15 and row_bcast31 */ for (unsigned i = 0; i < src.size(); i++) - bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u)); + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0u), Operand(0u)); if (cluster_size == 32) { reduction_needs_last_op = true; @@ -614,7 +626,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); + bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); // TODO: it would be more effective to do the last reduction step on SALU emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size()); break; @@ -626,8 +639,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig break; } assert(cluster_size == 64); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), 
dpp_row_bcast15, 0xa, 0xf, + false); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, + false); break; case aco_opcode::p_exclusive_scan: if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */ @@ -638,10 +653,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u)); for (unsigned i = 0; i < src.size(); i++) { - Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, - Definition(PhysReg{vtmp+i}, v1), - Operand(PhysReg{tmp+i}, v1), - Operand(0xffffffffu), Operand(0xffffffffu)).instr; + Instruction* perm = + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0xffffffffu), Operand(0xffffffffu)) + .instr; perm->vop3().opsel = 1; /* FI (Fetch Inactive) */ } bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX)); @@ -649,8 +664,10 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (ctx->program->wave_size == 64) { /* fill in the gap in row 2 */ for (unsigned i = 0; i < src.size(); i++) { - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); - bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1), + Operand(32u), Operand(PhysReg{vtmp + i}, v1)); } } std::swap(tmp, vtmp); @@ -660,41 +677,53 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig // TODO: use LDS on CS with a single write and shifted read /* wavefront shift_right by 1 on SI/CI */ emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */ bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u)); bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); - emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */ + emit_ds_swizzle(bld, tmp, tmp, src.size(), + 
ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */ bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u)); bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u)); for (unsigned i = 0; i < src.size(); i++) - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1)); bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); for (unsigned i = 0; i < src.size(); i++) { - bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1)); - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u)); - bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand(0u), + Operand(PhysReg{vtmp + i}, v1)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(0u)); + bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1), + Operand(32u), Operand(PhysReg{vtmp + i}, v1)); identity[i] = Operand(0u); /* prevent further uses of identity */ } std::swap(tmp, vtmp); } for (unsigned i = 0; i < src.size(); i++) { - if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */ + if (!identity[i].isConstant() || + identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */ if (ctx->program->chip_class < GFX10) - assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i}); - bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1)); + assert((identity[i].isConstant() && !identity[i].isLiteral()) || + identity[i].physReg() == PhysReg{sitmp + i}); + bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand(0u), + Operand(PhysReg{tmp + i}, v1)); } } FALLTHROUGH; @@ -731,28 +760,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); break; } - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(1), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(2), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(4), 0xf, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_sr(8), 0xf, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false, + identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false, + identity); if 
(ctx->program->chip_class >= GFX10) { bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u)); bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u)); for (unsigned i = 0; i < src.size(); i++) { - Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32, - Definition(PhysReg{vtmp+i}, v1), - Operand(PhysReg{tmp+i}, v1), - Operand(0xffffffffu), Operand(0xffffffffu)).instr; + Instruction* perm = + bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1), + Operand(PhysReg{tmp + i}, v1), Operand(0xffffffffu), Operand(0xffffffffu)) + .instr; perm->vop3().opsel = 1; /* FI (Fetch Inactive) */ } emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size()); @@ -760,21 +790,20 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (ctx->program->wave_size == 64) { bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); for (unsigned i = 0; i < src.size(); i++) - bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u)); + bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1), + Operand(31u)); emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size()); } } else { - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_bcast15, 0xa, 0xf, false, identity); - emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), - dpp_row_bcast31, 0xc, 0xf, false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, + false, identity); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, + false, identity); } break; - default: - unreachable("Invalid reduction mode"); + default: unreachable("Invalid reduction mode"); } - if (op == aco_opcode::p_reduce) { if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) { bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); @@ -791,8 +820,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (dst.regClass().type() == RegType::sgpr) { for (unsigned k = 0; k < src.size(); k++) { - bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), - Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1)); + bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1), + Operand(ctx->program->wave_size - 1)); } } else if (dst.physReg() != tmp) { for (unsigned k = 0; k < src.size(); k++) { @@ -802,7 +831,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig } } -void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, Builder &bld) +void +emit_gfx10_wave64_bpermute(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates proper bpermute on GFX10 in wave64 mode. 
* @@ -840,7 +870,8 @@ void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, B bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data); /* HI: Copy data from high lanes 32-63 to shared vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, + dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); /* Save EXEC */ bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2)); /* Set EXEC to enable LO lanes only */ @@ -848,30 +879,37 @@ void emit_gfx10_wave64_bpermute(Program *program, aco_ptr &instr, B /* LO: Copy data from low lanes 0-31 to shared vgpr */ bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data); /* LO: bpermute shared vgpr (high lanes' data) */ - bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, Operand(shared_vgpr_hi, v1)); + bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, + Operand(shared_vgpr_hi, v1)); /* Set EXEC to enable HI lanes only */ bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u)); /* HI: bpermute shared vgpr (low lanes' data) */ - bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, Operand(shared_vgpr_lo, v1)); + bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, + Operand(shared_vgpr_lo, v1)); /* Only enable lanes which use the other half's data */ - bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, Operand(tmp_exec.physReg(), s2), same_half); + bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, + Operand(tmp_exec.physReg(), s2), same_half); /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), + 0x3, 0xf, false); /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); + bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), + 0xc, 0xf, false); /* Restore saved EXEC */ bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2)); - /* RA assumes that the result is always in the low part of the register, so we have to shift, if it's not there already */ + /* RA assumes that the result is always in the low part of the register, so we have to shift, if + * it's not there already */ if (input_data.physReg().byte()) { unsigned right_shift = input_data.physReg().byte() * 8; bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand(right_shift), Operand(dst.physReg(), v1)); } } -void emit_gfx6_bpermute(Program *program, aco_ptr &instr, Builder &bld) +void +emit_gfx6_bpermute(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates bpermute using readlane instructions */ @@ -920,8 +958,9 @@ struct copy_operation { }; }; -void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *op, - const copy_operation& src, bool ignore_uses, unsigned max_size) +void +split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op, + const copy_operation& src, bool ignore_uses, unsigned max_size) { PhysReg def_reg = src.def.physReg(); PhysReg op_reg = src.op.physReg(); 
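A minimal standalone sketch of the chunk-size selection idea that split_copy in this file performs: pick the largest power-of-two chunk (up to a maximum) that the current byte offsets of both registers are aligned to, then advance. The next_chunk_size helper, the offsets, and the sizes here are illustrative assumptions, not code from this patch, and it omits the subdword RegClass and pre-GFX10 64-bit VGPR handling the real function has.

#include <cstdio>

/* hypothetical helper: largest power-of-two chunk (<= max_size) that both
 * byte offsets are aligned to and that still fits in the remaining bytes */
static unsigned
next_chunk_size(unsigned def_b, unsigned op_b, unsigned left, unsigned max_size)
{
   for (unsigned bytes = max_size; bytes > 1; bytes /= 2) {
      if (left >= bytes && def_b % bytes == 0 && op_b % bytes == 0)
         return bytes;
   }
   return 1;
}

int
main()
{
   /* split a 6-byte copy that starts at byte offset 2 of both registers:
    * emits a 2-byte chunk at offset 2, then a 4-byte chunk at offset 4 */
   unsigned def_b = 2, op_b = 2, left = 6;
   while (left) {
      unsigned bytes = next_chunk_size(def_b, op_b, left, 8);
      std::printf("copy %u byte(s) at offset %u\n", bytes, def_b);
      def_b += bytes;
      op_b += bytes;
      left -= bytes;
   }
   return 0;
}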
@@ -929,8 +968,7 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o op_reg.reg_b += offset; /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */ - if (ctx->program->chip_class < GFX10 && - src.def.regClass().type() == RegType::vgpr) + if (ctx->program->chip_class < GFX10 && src.def.regClass().type() == RegType::vgpr) max_size = MIN2(max_size, 4); unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; @@ -948,23 +986,23 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o break; } - RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) : - RegClass(src.def.regClass().type(), bytes).as_subdword(); + RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) + : RegClass(src.def.regClass().type(), bytes).as_subdword(); *def = Definition(src.def.tempId(), def_reg, def_cls); if (src.op.isConstant()) { assert(bytes >= 1 && bytes <= 8); uint64_t val = src.op.constantValue64() >> (offset * 8u); *op = Operand::get_const(ctx->program->chip_class, val, bytes); } else { - RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) : - RegClass(src.op.regClass().type(), bytes).as_subdword(); + RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) + : RegClass(src.op.regClass().type(), bytes).as_subdword(); *op = Operand(op_reg, op_cls); op->setTemp(Temp(src.op.tempId(), op_cls)); } } -uint32_t get_intersection_mask(int a_start, int a_size, - int b_start, int b_size) +uint32_t +get_intersection_mask(int a_start, int a_size, int b_start, int b_size) { int intersection_start = MAX2(b_start - a_start, 0); int intersection_end = MAX2(b_start + b_size - a_start, 0); @@ -975,7 +1013,8 @@ uint32_t get_intersection_mask(int a_start, int a_size, return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask; } -void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op) +void +copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) { assert(op.bytes() == dst.bytes()); @@ -1069,7 +1108,9 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op) } } -bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr) +bool +do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc, + PhysReg scratch_sgpr) { bool did_copy = false; for (unsigned offset = 0; offset < copy.bytes;) { @@ -1104,23 +1145,30 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool /* preserve the target's lower half */ uint32_t bits = def.physReg().byte() * 8; PhysReg lo_reg = PhysReg(def.physReg().reg()); - Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte())); - Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes())); + Definition lo_half = + Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte())); + Definition dst = + Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes())); if (def.physReg().reg() == op.physReg().reg()) { - bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass())); + bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), + Operand(lo_reg, lo_half.regClass())); if (def.physReg().byte() == 1) { bld.vop2(aco_opcode::v_mul_u32_u24, dst, 
Operand((1 << bits) + 1u), op); } else if (def.physReg().byte() == 2) { bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op); } else if (def.physReg().byte() == 3) { - bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), + Operand((1 << bits) + 1u)); bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op); } } else { lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte())); - bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass())); - bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte())); + bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), + Operand(lo_reg, lo_half.regClass())); + bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, + Operand(lo_half.physReg(), lo_half.regClass()), + Operand(4 - def.physReg().byte())); } } else { bld.vop1(aco_opcode::v_mov_b32, def, op); @@ -1137,13 +1185,16 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool return did_copy; } -void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi) +void +do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, + Pseudo_instruction* pi) { unsigned offset = 0; if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) && (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) { - /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */ + /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte + * swap */ PhysReg op = copy.op.physReg(); PhysReg def = copy.def.physReg(); op.reg_b &= ~0x3; @@ -1209,9 +1260,11 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op); bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op); if (preserve_scc) - bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u)); + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), + Operand(0u)); } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) { - bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, Operand(2u)); + bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, + Operand(2u)); } else { assert(def.regClass().is_subdword()); bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op); @@ -1232,7 +1285,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr); } -void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi) +void +do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi) { if (lo.isConstant() && hi.isConstant()) { copy_constant(ctx, bld, def, Operand(lo.constantValue() | (hi.constantValue() << 16))); @@ -1241,8 +1295,7 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) && (ctx->program->chip_class >= GFX10 || - (ctx->program->chip_class >= GFX9 && - !lo.isLiteral() && !hi.isLiteral())); + (ctx->program->chip_class >= GFX9 
&& !lo.isLiteral() && !hi.isLiteral())); if (can_use_pack) { Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi); @@ -1277,7 +1330,8 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand(16u), lo); else bld.vop2(aco_opcode::v_and_b32, def_lo, Operand(0xFFFFu), lo); - bld.vop2(aco_opcode::v_or_b32, def, Operand(hi.constantValue() << 16u), Operand(def.physReg(), v1)); + bld.vop2(aco_opcode::v_or_b32, def, Operand(hi.constantValue() << 16u), + Operand(def.physReg(), v1)); return; } @@ -1331,9 +1385,9 @@ void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand(2u)); } -void try_coalesce_copies(lower_context *ctx, - std::map& copy_map, - copy_operation& copy) +void +try_coalesce_copies(lower_context* ctx, std::map& copy_map, + copy_operation& copy) { // TODO try more relaxed alignment for subdword copies unsigned next_def_align = util_next_power_of_two(copy.bytes + 1); @@ -1359,8 +1413,8 @@ void try_coalesce_copies(lower_context *ctx, unsigned new_size = copy.bytes + other->second.bytes; if (copy.op.isConstant()) { - uint64_t val = copy.op.constantValue64() | - (other->second.op.constantValue64() << (copy.bytes * 8u)); + uint64_t val = + copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u)); if (!Operand::is_constant_representable(val, copy.bytes + other->second.bytes, true, copy.def.regClass().type() == RegType::vgpr)) return; @@ -1376,7 +1430,9 @@ void try_coalesce_copies(lower_context *ctx, copy_map.erase(other); } -void handle_operands(std::map& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi) +void +handle_operands(std::map& copy_map, lower_context* ctx, + chip_class chip_class, Pseudo_instruction* pi) { Builder bld(ctx->program, &ctx->instructions); unsigned num_instructions_before = ctx->instructions.size(); @@ -1408,8 +1464,10 @@ void handle_operands(std::map& copy_map, lower_context* copy_operation copy = {hi_op, hi_def, it->second.bytes - 8}; copy_map[hi_def.physReg()] = copy; assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0); - it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s2 : v2); - it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s2 : v2); + it->second.op = Operand(it->second.op.physReg(), + it->second.op.regClass().type() == RegType::sgpr ? s2 : v2); + it->second.def = Definition(it->second.def.physReg(), + it->second.def.regClass().type() == RegType::sgpr ? 
s2 : v2); it->second.bytes = 8; } @@ -1435,7 +1493,8 @@ void handle_operands(std::map& copy_map, lower_context* bool skip_partial_copies = true; for (auto it = copy_map.begin();;) { if (copy_map.empty()) { - ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before; + ctx->program->statistics[statistic_copies] += + ctx->instructions.size() - num_instructions_before; return; } if (it == copy_map.end()) { @@ -1451,12 +1510,10 @@ void handle_operands(std::map& copy_map, lower_context* std::map::iterator other = copy_map.find(reg_hi); if (other != copy_map.end() && other->second.bytes == 2) { /* check if the target register is otherwise unused */ - bool unused_lo = !it->second.is_used || - (it->second.is_used == 0x0101 && - other->second.op.physReg() == it->first); + bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 && + other->second.op.physReg() == it->first); bool unused_hi = !other->second.is_used || - (other->second.is_used == 0x0101 && - it->second.op.physReg() == reg_hi); + (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi); if (unused_lo && unused_hi) { Operand lo = it->second.op; Operand hi = other->second.op; @@ -1482,8 +1539,8 @@ void handle_operands(std::map& copy_map, lower_context* /* on GFX6/7, we need some small workarounds as there is no * SDWA instruction to do partial register writes */ if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) { - if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && - !it->second.is_used && pi->opcode == aco_opcode::p_split_vector) { + if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used && + pi->opcode == aco_opcode::p_split_vector) { /* Other operations might overwrite the high bits, so change all users * of the high bits to the new target where they are still available. * This mechanism depends on also emitting dead definitions. */ @@ -1502,7 +1559,8 @@ void handle_operands(std::map& copy_map, lower_context* } else if (it->first.byte()) { assert(pi->opcode == aco_opcode::p_create_vector); /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled, - * move to the target operand's high bits. This is save to do as it cannot be an operand */ + * move to the target operand's high bits. 
This is save to do as it cannot be an operand + */ PhysReg lo = PhysReg(it->first.reg()); std::map::iterator other = copy_map.find(lo); if (other != copy_map.end()) { @@ -1511,8 +1569,10 @@ void handle_operands(std::map& copy_map, lower_context* it->second.def = Definition(new_reg_hi, it->second.def.regClass()); it->second.is_used = 0; other->second.bytes += it->second.bytes; - other->second.def.setTemp(Temp(other->second.def.tempId(), RegClass::get(RegType::vgpr, other->second.bytes))); - other->second.op.setTemp(Temp(other->second.op.tempId(), RegClass::get(RegType::vgpr, other->second.bytes))); + other->second.def.setTemp(Temp(other->second.def.tempId(), + RegClass::get(RegType::vgpr, other->second.bytes))); + other->second.op.setTemp(Temp(other->second.op.tempId(), + RegClass::get(RegType::vgpr, other->second.bytes))); /* if the new target's high bits are also a target, change uses */ std::map::iterator target = copy_map.find(new_reg_hi); if (target != copy_map.end()) { @@ -1604,7 +1664,7 @@ void handle_operands(std::map& copy_map, lower_context* * operand (for example, v[7:8] = v[8:9]) */ if (did_copy && !copy.second.op.isConstant()) { for (std::pair& other : copy_map) { - for (uint16_t i = 0; i < other.second.bytes; i++) { + for (uint16_t i = 0; i < other.second.bytes; i++) { /* distance might underflow */ unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b; if (distance < copy.second.bytes && !copy.second.uses[distance]) @@ -1690,13 +1750,15 @@ void handle_operands(std::map& copy_map, lower_context* /* change the operand reg of the target's uses and split uses if needed */ uint32_t bytes_left = u_bit_consecutive(0, swap.bytes); for (auto target = copy_map.begin(); target != copy_map.end(); ++target) { - if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) { + if (target->second.op.physReg() == swap.def.physReg() && + swap.bytes == target->second.bytes) { target->second.op.setFixed(swap.op.physReg()); break; } - uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes, - target->second.op.physReg().reg_b, target->second.bytes); + uint32_t imask = + get_intersection_mask(swap.def.physReg().reg_b, swap.bytes, + target->second.op.physReg().reg_b, target->second.bytes); if (!imask) continue; @@ -1752,7 +1814,8 @@ void handle_operands(std::map& copy_map, lower_context* ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before; } -void emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm) +void +emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm) { if (bld.program->chip_class >= GFX10) { if (set_round) @@ -1761,13 +1824,15 @@ void emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_d bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm); } else if (set_round || set_denorm) { /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */ - Instruction *instr = bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(new_mode.val), (7 << 11) | 1).instr; + Instruction* instr = + bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(new_mode.val), (7 << 11) | 1).instr; /* has to be a literal */ instr->operands[0].setFixed(PhysReg{255}); } } -void emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set) +void +emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set) { float_mode config_mode; config_mode.val = program.config->float_mode; @@ 
-1788,13 +1853,13 @@ void emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool emit_set_mode(bld, block->fp_mode, set_round, set_denorm); } -void lower_to_hw_instr(Program* program) +void +lower_to_hw_instr(Program* program) { - Block *discard_block = NULL; + Block* discard_block = NULL; - for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) - { - Block *block = &program->blocks[block_idx]; + for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) { + Block* block = &program->blocks[block_idx]; lower_context ctx; ctx.program = program; ctx.block = block; @@ -1806,12 +1871,10 @@ void lower_to_hw_instr(Program* program) aco_ptr& instr = block->instructions[instr_idx]; aco_ptr mov; if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) { - Pseudo_instruction *pi = &instr->pseudo(); + Pseudo_instruction* pi = &instr->pseudo(); - switch (instr->opcode) - { - case aco_opcode::p_extract_vector: - { + switch (instr->opcode) { + case aco_opcode::p_extract_vector: { PhysReg reg = instr->operands[0].physReg(); Definition& def = instr->definitions[0]; reg.reg_b += instr->operands[1].constantValue() * def.bytes(); @@ -1819,21 +1882,22 @@ void lower_to_hw_instr(Program* program) if (reg == def.physReg()) break; - RegClass op_rc = def.regClass().is_subdword() ? def.regClass() : - RegClass(instr->operands[0].getTemp().type(), def.size()); + RegClass op_rc = def.regClass().is_subdword() + ? def.regClass() + : RegClass(instr->operands[0].getTemp().type(), def.size()); std::map copy_operations; copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()}; handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_create_vector: - { + case aco_opcode::p_create_vector: { std::map copy_operations; PhysReg reg = instr->definitions[0].physReg(); for (const Operand& op : instr->operands) { if (op.isConstant()) { - const Definition def = Definition(reg, RegClass(instr->definitions[0].getTemp().type(), op.size())); + const Definition def = Definition( + reg, RegClass(instr->definitions[0].getTemp().type(), op.size())); copy_operations[reg] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); continue; @@ -1844,8 +1908,10 @@ void lower_to_hw_instr(Program* program) continue; } - RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : - RegClass(instr->definitions[0].getTemp().type(), op.size()); + RegClass rc_def = + op.regClass().is_subdword() + ? op.regClass() + : RegClass(instr->definitions[0].getTemp().type(), op.size()); const Definition def = Definition(reg, rc_def); copy_operations[def.physReg()] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); @@ -1853,14 +1919,14 @@ void lower_to_hw_instr(Program* program) handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_split_vector: - { + case aco_opcode::p_split_vector: { std::map copy_operations; PhysReg reg = instr->operands[0].physReg(); for (const Definition& def : instr->definitions) { - RegClass rc_op = def.regClass().is_subdword() ? def.regClass() : - RegClass(instr->operands[0].getTemp().type(), def.size()); + RegClass rc_op = def.regClass().is_subdword() + ? 
def.regClass() + : RegClass(instr->operands[0].getTemp().type(), def.size()); const Operand op = Operand(reg, rc_op); copy_operations[def.physReg()] = {op, def, def.bytes()}; reg.reg_b += def.bytes(); @@ -1869,26 +1935,26 @@ void lower_to_hw_instr(Program* program) break; } case aco_opcode::p_parallelcopy: - case aco_opcode::p_wqm: - { + case aco_opcode::p_wqm: { std::map copy_operations; for (unsigned j = 0; j < instr->operands.size(); j++) { assert(instr->definitions[j].bytes() == instr->operands[j].bytes()); - copy_operations[instr->definitions[j].physReg()] = {instr->operands[j], instr->definitions[j], instr->operands[j].bytes()}; + copy_operations[instr->definitions[j].physReg()] = { + instr->operands[j], instr->definitions[j], instr->operands[j].bytes()}; } handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_exit_early_if: - { + case aco_opcode::p_exit_early_if: { /* don't bother with an early exit near the end of the program */ if ((block->instructions.size() - 1 - instr_idx) <= 4 && - block->instructions.back()->opcode == aco_opcode::s_endpgm) { - unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; + block->instructions.back()->opcode == aco_opcode::s_endpgm) { + unsigned null_exp_dest = + (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; bool ignore_early_exit = true; for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) { - const aco_ptr &instr2 = block->instructions[k]; + const aco_ptr& instr2 = block->instructions[k]; if (instr2->opcode == aco_opcode::s_endpgm || instr2->opcode == aco_opcode::p_logical_end) continue; @@ -1896,8 +1962,8 @@ void lower_to_hw_instr(Program* program) instr2->exp().dest == null_exp_dest) continue; else if (instr2->opcode == aco_opcode::p_parallelcopy && - instr2->definitions[0].isFixed() && - instr2->definitions[0].physReg() == exec) + instr2->definitions[0].isFixed() && + instr2->definitions[0].physReg() == exec) continue; ignore_early_exit = false; @@ -1912,50 +1978,49 @@ void lower_to_hw_instr(Program* program) block = &program->blocks[block_idx]; bld.reset(discard_block); - bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), - 0, V_008DFC_SQ_EXP_NULL, false, true, true); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, + V_008DFC_SQ_EXP_NULL, false, true, true); bld.sopp(aco_opcode::s_endpgm); bld.reset(&ctx.instructions); } - //TODO: exec can be zero here with block_kind_discard + // TODO: exec can be zero here with block_kind_discard assert(instr->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], discard_block->index); + bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], + discard_block->index); discard_block->linear_preds.push_back(block->index); block->linear_succs.push_back(discard_block->index); break; } - case aco_opcode::p_spill: - { + case aco_opcode::p_spill: { assert(instr->operands[0].regClass() == v1.as_linear()); for (unsigned i = 0; i < instr->operands[2].size(); i++) { - Operand src = instr->operands[2].isConstant() ? - Operand(uint32_t(instr->operands[2].constantValue64() >> (32 * i))) : - Operand(PhysReg{instr->operands[2].physReg() + i}, s1); - bld.writelane(bld.def(v1, instr->operands[0].physReg()), - src, + Operand src = + instr->operands[2].isConstant() + ? 
Operand(uint32_t(instr->operands[2].constantValue64() >> (32 * i))) + : Operand(PhysReg{instr->operands[2].physReg() + i}, s1); + bld.writelane(bld.def(v1, instr->operands[0].physReg()), src, Operand(instr->operands[1].constantValue() + i), instr->operands[0]); } break; } - case aco_opcode::p_reload: - { + case aco_opcode::p_reload: { assert(instr->operands[0].regClass() == v1.as_linear()); for (unsigned i = 0; i < instr->definitions[0].size(); i++) bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), - instr->operands[0], - Operand(instr->operands[1].constantValue() + i)); + instr->operands[0], Operand(instr->operands[1].constantValue() + i)); break; } - case aco_opcode::p_as_uniform: - { - if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) { + case aco_opcode::p_as_uniform: { + if (instr->operands[0].isConstant() || + instr->operands[0].regClass().type() == RegType::sgpr) { std::map copy_operations; - copy_operations[instr->definitions[0].physReg()] = {instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()}; + copy_operations[instr->definitions[0].physReg()] = { + instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()}; handle_operands(copy_operations, &ctx, program->chip_class, pi); } else { assert(instr->operands[0].regClass().type() == RegType::vgpr); @@ -1969,8 +2034,7 @@ void lower_to_hw_instr(Program* program) } break; } - case aco_opcode::p_bpermute: - { + case aco_opcode::p_bpermute: { if (ctx.program->chip_class <= GFX7) emit_gfx6_bpermute(program, instr, bld); else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64) @@ -1979,8 +2043,7 @@ void lower_to_hw_instr(Program* program) unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); break; } - case aco_opcode::p_constaddr: - { + case aco_opcode::p_constaddr: { unsigned id = instr->definitions[0].tempId(); PhysReg reg = instr->definitions[0].physReg(); bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand(id)); @@ -1990,8 +2053,7 @@ void lower_to_hw_instr(Program* program) Operand(reg.advance(4), s1), Operand(0u), Operand(scc, s1)); break; } - case aco_opcode::p_extract: - { + case aco_opcode::p_extract: { assert(instr->operands[1].isConstant()); assert(instr->operands[2].isConstant()); assert(instr->operands[3].isConstant()); @@ -2006,26 +2068,28 @@ void lower_to_hw_instr(Program* program) if (dst.regClass() == s1) { if (offset == (32 - bits)) { - bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, - dst, bld.def(s1, scc), op, Operand(offset)); + bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst, + bld.def(s1, scc), op, Operand(offset)); } else if (offset == 0 && signext && (bits == 8 || bits == 16)) { - bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, dst, op); + bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, + dst, op); } else { - bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, - dst, bld.def(s1, scc), op, Operand((bits << 16) | offset)); + bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst, + bld.def(s1, scc), op, Operand((bits << 16) | offset)); } } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) { assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0); if (offset == (32 - bits) && op.regClass() != s1) { - bld.vop2(signext ? 
aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, - dst, Operand(offset), op); + bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst, + Operand(offset), op); } else { - bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, - dst, op, Operand(offset), Operand(bits)); + bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op, + Operand(offset), Operand(bits)); } } else if (dst.regClass() == v2b) { aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = Operand(op.physReg().advance(-op.physReg().byte()), RegClass::get(op.regClass().type(), 4)); sdwa->definitions[0] = dst; @@ -2037,8 +2101,7 @@ void lower_to_hw_instr(Program* program) } break; } - case aco_opcode::p_insert: - { + case aco_opcode::p_insert: { assert(instr->operands[1].isConstant()); assert(instr->operands[2].isConstant()); if (instr->definitions[0].regClass() == s1) @@ -2053,18 +2116,24 @@ void lower_to_hw_instr(Program* program) if (offset == (32 - bits)) { bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op, Operand(offset)); } else if (offset == 0) { - bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand(bits << 16)); + bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, + Operand(bits << 16)); } else { - bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand(bits << 16)); - bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), Operand(dst.physReg(), s1), Operand(offset)); + bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, + Operand(bits << 16)); + bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), + Operand(dst.physReg(), s1), Operand(offset)); } } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) { if (offset == (dst.bytes() * 8u - bits)) { bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), op); } else if (offset == 0) { bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand(0u), Operand(bits)); - } else if (program->chip_class >= GFX9 || (op.regClass() != s1 && program->chip_class >= GFX8)) { - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + } else if (program->chip_class >= GFX9 || + (op.regClass() != s1 && program->chip_class >= GFX8)) { + aco_ptr sdwa{create_instruction( + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = op; sdwa->definitions[0] = dst; sdwa->sel[0] = sdwa_udword; @@ -2072,14 +2141,17 @@ void lower_to_hw_instr(Program* program) bld.insert(std::move(sdwa)); } else { bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand(0u), Operand(bits)); - bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), Operand(dst.physReg(), v1)); + bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand(offset), + Operand(dst.physReg(), v1)); } } else { assert(dst.regClass() == v2b); aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, (Format)((uint16_t)Format::VOP1|(uint16_t)Format::SDWA), 1, 1)}; + aco_opcode::v_mov_b32, + (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; sdwa->operands[0] = op; - sdwa->definitions[0] = Definition(dst.physReg().advance(-dst.physReg().byte()), v1); + sdwa->definitions[0] = + Definition(dst.physReg().advance(-dst.physReg().byte()), v1); sdwa->sel[0] = sdwa_uword; sdwa->dst_sel = sdwa_ubyte0 + dst.physReg().byte() + index; 
sdwa->dst_preserve = 1; @@ -2087,8 +2159,7 @@ void lower_to_hw_instr(Program* program) } break; } - default: - break; + default: break; } } else if (instr->isBranch()) { Pseudo_branch_instruction* branch = &instr->branch(); @@ -2132,42 +2203,41 @@ void lower_to_hw_instr(Program* program) continue; switch (instr->opcode) { - case aco_opcode::p_branch: - assert(block->linear_succs[0] == target); - bld.sopp(aco_opcode::s_branch, branch->definitions[0], target); - break; - case aco_opcode::p_cbranch_nz: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target); - } - break; - case aco_opcode::p_cbranch_z: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target); - } - break; - default: - unreachable("Unknown Pseudo branch instruction!"); + case aco_opcode::p_branch: + assert(block->linear_succs[0] == target); + bld.sopp(aco_opcode::s_branch, branch->definitions[0], target); + break; + case aco_opcode::p_cbranch_nz: + assert(block->linear_succs[1] == target); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target); + } + break; + case aco_opcode::p_cbranch_z: + assert(block->linear_succs[1] == target); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target); + } + break; + default: unreachable("Unknown Pseudo branch instruction!"); } } else if (instr->isReduction()) { Pseudo_reduction_instruction& reduce = instr->reduction(); emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size, - reduce.operands[1].physReg(), // tmp + reduce.operands[1].physReg(), // tmp reduce.definitions[1].physReg(), // stmp - reduce.operands[2].physReg(), // vtmp + reduce.operands[2].physReg(), // vtmp reduce.definitions[2].physReg(), // sitmp reduce.operands[0], reduce.definitions[0]); } else if (instr->isBarrier()) { @@ -2196,10 +2266,9 @@ void lower_to_hw_instr(Program* program) } else { ctx.instructions.emplace_back(std::move(instr)); } - } block->instructions.swap(ctx.instructions); } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 4faa9bad687..32c0bd8a120 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -36,8 +36,9 @@ namespace aco { 
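/* Sketch of the opcode selection in the branch lowering above, with plain
 * enums standing in for aco_opcode and PhysReg: the pseudo branch keeps its
 * condition in exec, vcc or scc, and lowering just picks the SOPP branch that
 * tests that register directly (p_branch maps to the unconditional s_branch). */
#include <cassert>

enum class cond_reg { exec, vcc, scc };
enum class branch_op {
   s_cbranch_execnz,
   s_cbranch_execz,
   s_cbranch_vccnz,
   s_cbranch_vccz,
   s_cbranch_scc1,
   s_cbranch_scc0,
};

branch_op
select_cbranch(cond_reg cond, bool branch_if_nonzero)
{
   switch (cond) {
   case cond_reg::exec:
      return branch_if_nonzero ? branch_op::s_cbranch_execnz : branch_op::s_cbranch_execz;
   case cond_reg::vcc:
      return branch_if_nonzero ? branch_op::s_cbranch_vccnz : branch_op::s_cbranch_vccz;
   case cond_reg::scc:
      return branch_if_nonzero ? branch_op::s_cbranch_scc1 : branch_op::s_cbranch_scc0;
   }
   assert(false);
   return branch_op::s_cbranch_scc0;
}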
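/* Standalone sketch of the values p_extract / p_insert compute once lowered to
 * the shift/BFE sequences above, assuming 1 <= bits and offset + bits <= 32.
 * The scalar s_bfe_* take a packed operand of the form (bits << 16) | offset,
 * as built above; the vector v_bfe_* take offset and width as separate
 * operands. For p_insert on a full dword the bits outside the field are not
 * preserved; only the sub-dword SDWA path above keeps the rest of the
 * containing dword, via dst_preserve. */
#include <cstdint>

static inline uint32_t
field_mask(unsigned bits)
{
   return bits >= 32 ? ~0u : (1u << bits) - 1u;
}

/* p_extract: take 'bits' bits starting at 'offset', zero- or sign-extended */
uint32_t
extract_u32(uint32_t src, unsigned offset, unsigned bits)
{
   return (src >> offset) & field_mask(bits);
}

int32_t
extract_i32(uint32_t src, unsigned offset, unsigned bits)
{
   uint32_t field = extract_u32(src, offset, bits);
   uint32_t sign = 1u << (bits - 1);
   return (int32_t)((field ^ sign) - sign); /* sign-extend the extracted field */
}

/* p_insert on a full dword: the low 'bits' bits of src placed at 'offset' */
uint32_t
insert_u32(uint32_t src, unsigned offset, unsigned bits)
{
   return (src & field_mask(bits)) << offset;
}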
namespace { -inline -uint32_t murmur_32_scramble(uint32_t h, uint32_t k) { +inline uint32_t +murmur_32_scramble(uint32_t h, uint32_t k) +{ k *= 0xcc9e2d51; k = (k << 15) | (k >> 17); h ^= k * 0x1b873593; @@ -46,8 +47,9 @@ uint32_t murmur_32_scramble(uint32_t h, uint32_t k) { return h; } -template -uint32_t hash_murmur_32(Instruction* instr) +template +uint32_t +hash_murmur_32(Instruction* instr) { uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode); @@ -58,7 +60,7 @@ uint32_t hash_murmur_32(Instruction* instr) for (unsigned i = 2; i < (sizeof(T) >> 2); i++) { uint32_t u; /* Accesses it though a byte array, so doesn't violate the strict aliasing rule */ - memcpy(&u, reinterpret_cast(instr) + i * 4, 4); + memcpy(&u, reinterpret_cast(instr) + i * 4, 4); hash = murmur_32_scramble(hash, u); } @@ -92,32 +94,19 @@ struct InstrHash { return hash_murmur_32(instr); switch (instr->format) { - case Format::SMEM: - return hash_murmur_32(instr); - case Format::VINTRP: - return hash_murmur_32(instr); - case Format::DS: - return hash_murmur_32(instr); - case Format::SOPP: - return hash_murmur_32(instr); - case Format::SOPK: - return hash_murmur_32(instr); - case Format::EXP: - return hash_murmur_32(instr); - case Format::MUBUF: - return hash_murmur_32(instr); - case Format::MIMG: - return hash_murmur_32(instr); - case Format::MTBUF: - return hash_murmur_32(instr); - case Format::FLAT: - return hash_murmur_32(instr); - case Format::PSEUDO_BRANCH: - return hash_murmur_32(instr); - case Format::PSEUDO_REDUCTION: - return hash_murmur_32(instr); - default: - return hash_murmur_32(instr); + case Format::SMEM: return hash_murmur_32(instr); + case Format::VINTRP: return hash_murmur_32(instr); + case Format::DS: return hash_murmur_32(instr); + case Format::SOPP: return hash_murmur_32(instr); + case Format::SOPK: return hash_murmur_32(instr); + case Format::EXP: return hash_murmur_32(instr); + case Format::MUBUF: return hash_murmur_32(instr); + case Format::MIMG: return hash_murmur_32(instr); + case Format::MTBUF: return hash_murmur_32(instr); + case Format::FLAT: return hash_murmur_32(instr); + case Format::PSEUDO_BRANCH: return hash_murmur_32(instr); + case Format::PSEUDO_REDUCTION: return hash_murmur_32(instr); + default: return hash_murmur_32(instr); } } }; @@ -129,7 +118,8 @@ struct InstrPred { return false; if (a->opcode != b->opcode) return false; - if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size()) + if (a->operands.size() != b->operands.size() || + a->definitions.size() != b->definitions.size()) return false; /* possible with pseudo-instructions */ for (unsigned i = 0; i < a->operands.size(); i++) { if (a->operands[i].isConstant()) { @@ -137,14 +127,12 @@ struct InstrPred { return false; if (a->operands[i].constantValue() != b->operands[i].constantValue()) return false; - } - else if (a->operands[i].isTemp()) { + } else if (a->operands[i].isTemp()) { if (!b->operands[i].isTemp()) return false; if (a->operands[i].tempId() != b->operands[i].tempId()) return false; - } - else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) + } else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) return false; if (a->operands[i].isFixed()) { if (!b->operands[i].isFixed()) @@ -179,154 +167,110 @@ struct InstrPred { VOP3_instruction& a3 = a->vop3(); VOP3_instruction& b3 = b->vop3(); for (unsigned i = 0; i < 3; i++) { - if (a3.abs[i] != b3.abs[i] || - a3.neg[i] != b3.neg[i]) + if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i]) 
return false; } - return a3.clamp == b3.clamp && - a3.omod == b3.omod && - a3.opsel == b3.opsel; + return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel; } if (a->isDPP()) { DPP_instruction& aDPP = a->dpp(); DPP_instruction& bDPP = b->dpp(); - return aDPP.pass_flags == bDPP.pass_flags && - aDPP.dpp_ctrl == bDPP.dpp_ctrl && - aDPP.bank_mask == bDPP.bank_mask && - aDPP.row_mask == bDPP.row_mask && - aDPP.bound_ctrl == bDPP.bound_ctrl && - aDPP.abs[0] == bDPP.abs[0] && - aDPP.abs[1] == bDPP.abs[1] && - aDPP.neg[0] == bDPP.neg[0] && + return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl && + aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask && + aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] && + aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] && aDPP.neg[1] == bDPP.neg[1]; } if (a->isSDWA()) { SDWA_instruction& aSDWA = a->sdwa(); SDWA_instruction& bSDWA = b->sdwa(); - return aSDWA.sel[0] == bSDWA.sel[0] && - aSDWA.sel[1] == bSDWA.sel[1] && - aSDWA.dst_sel == bSDWA.dst_sel && - aSDWA.abs[0] == bSDWA.abs[0] && - aSDWA.abs[1] == bSDWA.abs[1] && - aSDWA.neg[0] == bSDWA.neg[0] && - aSDWA.neg[1] == bSDWA.neg[1] && - aSDWA.dst_preserve == bSDWA.dst_preserve && - aSDWA.clamp == bSDWA.clamp && - aSDWA.omod == bSDWA.omod; + return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] && + aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] && + aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] && + aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve && + aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod; } switch (a->format) { - case Format::SOPK: { - if (a->opcode == aco_opcode::s_getreg_b32) + case Format::SOPK: { + if (a->opcode == aco_opcode::s_getreg_b32) + return false; + SOPK_instruction& aK = a->sopk(); + SOPK_instruction& bK = b->sopk(); + return aK.imm == bK.imm; + } + case Format::SMEM: { + SMEM_instruction& aS = a->smem(); + SMEM_instruction& bS = b->smem(); + /* isel shouldn't be creating situations where this assertion fails */ + assert(aS.prevent_overflow == bS.prevent_overflow); + return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv && + aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow; + } + case Format::VINTRP: { + Interp_instruction& aI = a->vintrp(); + Interp_instruction& bI = b->vintrp(); + if (aI.attribute != bI.attribute) + return false; + if (aI.component != bI.component) + return false; + return true; + } + case Format::VOP3P: { + VOP3P_instruction& a3P = a->vop3p(); + VOP3P_instruction& b3P = b->vop3p(); + for (unsigned i = 0; i < 3; i++) { + if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i]) return false; - SOPK_instruction& aK = a->sopk(); - SOPK_instruction& bK = b->sopk(); - return aK.imm == bK.imm; } - case Format::SMEM: { - SMEM_instruction& aS = a->smem(); - SMEM_instruction& bS = b->smem(); - /* isel shouldn't be creating situations where this assertion fails */ - assert(aS.prevent_overflow == bS.prevent_overflow); - return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && - aS.nv == bS.nv && aS.disable_wqm == bS.disable_wqm && - aS.prevent_overflow == bS.prevent_overflow; - } - case Format::VINTRP: { - Interp_instruction& aI = a->vintrp(); - Interp_instruction& bI = b->vintrp(); - if (aI.attribute != bI.attribute) - return false; - if (aI.component != bI.component) - return false; - return true; - } - case Format::VOP3P: { - 
VOP3P_instruction& a3P = a->vop3p(); - VOP3P_instruction& b3P = b->vop3p(); - for (unsigned i = 0; i < 3; i++) { - if (a3P.neg_lo[i] != b3P.neg_lo[i] || - a3P.neg_hi[i] != b3P.neg_hi[i]) - return false; - } - return a3P.opsel_lo == b3P.opsel_lo && - a3P.opsel_hi == b3P.opsel_hi && - a3P.clamp == b3P.clamp; - } - case Format::PSEUDO_REDUCTION: { - Pseudo_reduction_instruction& aR = a->reduction(); - Pseudo_reduction_instruction& bR = b->reduction(); - return aR.pass_flags == bR.pass_flags && - aR.reduce_op == bR.reduce_op && - aR.cluster_size == bR.cluster_size; - } - case Format::DS: { - assert(a->opcode == aco_opcode::ds_bpermute_b32 || - a->opcode == aco_opcode::ds_permute_b32 || - a->opcode == aco_opcode::ds_swizzle_b32); - DS_instruction& aD = a->ds(); - DS_instruction& bD = b->ds(); - return aD.sync == bD.sync && - aD.pass_flags == bD.pass_flags && - aD.gds == bD.gds && - aD.offset0 == bD.offset0 && - aD.offset1 == bD.offset1; - } - case Format::MTBUF: { - MTBUF_instruction& aM = a->mtbuf(); - MTBUF_instruction& bM = b->mtbuf(); - return aM.sync == bM.sync && - aM.dfmt == bM.dfmt && - aM.nfmt == bM.nfmt && - aM.offset == bM.offset && - aM.offen == bM.offen && - aM.idxen == bM.idxen && - aM.glc == bM.glc && - aM.dlc == bM.dlc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.disable_wqm == bM.disable_wqm; - } - case Format::MUBUF: { - MUBUF_instruction& aM = a->mubuf(); - MUBUF_instruction& bM = b->mubuf(); - return aM.sync == bM.sync && - aM.offset == bM.offset && - aM.offen == bM.offen && - aM.idxen == bM.idxen && - aM.glc == bM.glc && - aM.dlc == bM.dlc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.lds == bM.lds && - aM.disable_wqm == bM.disable_wqm; - } - case Format::MIMG: { - MIMG_instruction& aM = a->mimg(); - MIMG_instruction& bM = b->mimg(); - return aM.sync == bM.sync && - aM.dmask == bM.dmask && - aM.unrm == bM.unrm && - aM.glc == bM.glc && - aM.slc == bM.slc && - aM.tfe == bM.tfe && - aM.da == bM.da && - aM.lwe == bM.lwe && - aM.r128 == bM.r128 && - aM.a16 == bM.a16 && - aM.d16 == bM.d16 && - aM.disable_wqm == bM.disable_wqm; - } - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - case Format::EXP: - case Format::SOPP: - case Format::PSEUDO_BRANCH: - case Format::PSEUDO_BARRIER: - assert(false); - default: - return true; + return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi && + a3P.clamp == b3P.clamp; + } + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction& aR = a->reduction(); + Pseudo_reduction_instruction& bR = b->reduction(); + return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op && + aR.cluster_size == bR.cluster_size; + } + case Format::DS: { + assert(a->opcode == aco_opcode::ds_bpermute_b32 || + a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32); + DS_instruction& aD = a->ds(); + DS_instruction& bD = b->ds(); + return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds && + aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1; + } + case Format::MTBUF: { + MTBUF_instruction& aM = a->mtbuf(); + MTBUF_instruction& bM = b->mtbuf(); + return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt && + aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen && + aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe && + aM.disable_wqm == bM.disable_wqm; + } + case Format::MUBUF: { + MUBUF_instruction& aM = a->mubuf(); + MUBUF_instruction& bM = b->mubuf(); + return aM.sync == bM.sync && aM.offset == 
bM.offset && aM.offen == bM.offen && + aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && + aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm; + } + case Format::MIMG: { + MIMG_instruction& aM = a->mimg(); + MIMG_instruction& bM = b->mimg(); + return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm && + aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da && + aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 && + aM.disable_wqm == bM.disable_wqm; + } + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: assert(false); + default: return true; } } }; @@ -345,7 +289,8 @@ struct vn_ctx { */ uint32_t exec_id = 1; - vn_ctx(Program* program_) : program(program_) { + vn_ctx(Program* program_) : program(program_) + { static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits"); unsigned size = 0; for (Block& block : program->blocks) @@ -354,11 +299,11 @@ struct vn_ctx { } }; - /* dominates() returns true if the parent block dominates the child block and * if the parent block is part of the same loop or has a smaller loop nest depth. */ -bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) +bool +dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) { unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth; while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth) @@ -375,42 +320,40 @@ bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) * Note that expr_set must not be used with instructions * which cannot be eliminated. */ -bool can_eliminate(aco_ptr& instr) +bool +can_eliminate(aco_ptr& instr) { switch (instr->format) { - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - case Format::EXP: - case Format::SOPP: - case Format::PSEUDO_BRANCH: - case Format::PSEUDO_BARRIER: + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: return false; + case Format::DS: + return instr->opcode == aco_opcode::ds_bpermute_b32 || + instr->opcode == aco_opcode::ds_permute_b32 || + instr->opcode == aco_opcode::ds_swizzle_b32; + case Format::SMEM: + case Format::MUBUF: + case Format::MIMG: + case Format::MTBUF: + if (!get_sync_info(instr.get()).can_reorder()) return false; - case Format::DS: - return instr->opcode == aco_opcode::ds_bpermute_b32 || - instr->opcode == aco_opcode::ds_permute_b32 || - instr->opcode == aco_opcode::ds_swizzle_b32; - case Format::SMEM: - case Format::MUBUF: - case Format::MIMG: - case Format::MTBUF: - if (!get_sync_info(instr.get()).can_reorder()) - return false; - break; - default: - break; + break; + default: break; } - if (instr->definitions.empty() || - instr->opcode == aco_opcode::p_phi || - instr->opcode == aco_opcode::p_linear_phi || - instr->definitions[0].isNoCSE()) + if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || + instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE()) return false; return true; } -void process_block(vn_ctx& ctx, Block& block) +void +process_block(vn_ctx& ctx, Block& block) { std::vector> new_instructions; new_instructions.reserve(block.instructions.size()); @@ -435,8 +378,9 @@ void process_block(vn_ctx& ctx, Block& block) } /* simple copy-propagation through renaming */ - bool 
copy_instr = instr->opcode == aco_opcode::p_parallelcopy || - (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); + bool copy_instr = + instr->opcode == aco_opcode::p_parallelcopy || + (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && instr->operands[0].regClass() == instr->definitions[0].regClass()) { ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); @@ -479,7 +423,8 @@ void process_block(vn_ctx& ctx, Block& block) block.instructions = std::move(new_instructions); } -void rename_phi_operands(Block& block, std::map& renames) +void +rename_phi_operands(Block& block, std::map& renames) { for (aco_ptr& phi : block.instructions) { if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) @@ -496,8 +441,8 @@ void rename_phi_operands(Block& block, std::map& renames) } } /* end namespace */ - -void value_numbering(Program* program) +void +value_numbering(Program* program) { vn_ctx ctx(program); std::vector loop_headers; @@ -521,10 +466,8 @@ void value_numbering(Program* program) rename_phi_operands(block, ctx.renames); /* increment exec_id when entering nested control flow */ - if (block.kind & block_kind_branch || - block.kind & block_kind_loop_preheader || - block.kind & block_kind_break || - block.kind & block_kind_continue || + if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader || + block.kind & block_kind_break || block.kind & block_kind_continue || block.kind & block_kind_discard) ctx.exec_id++; else if (block.kind & block_kind_continue_or_break) @@ -538,4 +481,4 @@ void value_numbering(Program* program) } } -} +} // namespace aco diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 9ec30d6a0c1..da0769ec301 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/half_float.h" #include "util/memstream.h" @@ -33,14 +34,15 @@ namespace aco { #ifndef NDEBUG -void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr) +void +perfwarn(Program* program, bool cond, const char* msg, Instruction* instr) { if (cond) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); aco_print_instr(instr, memf); @@ -69,7 +71,6 @@ void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr) * instructions are removed from the sequence. 
*/ - struct mad_info { aco_ptr add_instr; uint32_t mul_temp_id; @@ -77,7 +78,8 @@ struct mad_info { bool check_literal; mad_info(aco_ptr instr, uint32_t id) - : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) {} + : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false) + {} }; enum Label { @@ -112,22 +114,25 @@ enum Label { label_b2i = 1 << 27, label_fcanonicalize = 1 << 28, label_constant_16bit = 1 << 29, - label_usedef = 1 << 30, /* generic label */ + label_usedef = 1 << 30, /* generic label */ label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */ label_canonicalized = 1ull << 32, label_extract = 1ull << 33, label_insert = 1ull << 34, }; -static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub | label_vop3p | - label_bitwise | label_uniform_bitwise | label_minmax | label_vopc | - label_usedef | label_extract; -static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; +static constexpr uint64_t instr_usedef_labels = + label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise | + label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract; +static constexpr uint64_t instr_mod_labels = + label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels; -static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | - label_scc_invert | label_b2i | label_fcanonicalize; -static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; +static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | + label_uniform_bool | label_scc_invert | label_b2i | + label_fcanonicalize; +static constexpr uint32_t val_labels = + label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect"); static_assert((instr_labels & val_labels) == 0, "labels cannot intersect"); @@ -161,7 +166,8 @@ struct ssa_info { label &= ~(instr_labels | val_labels); /* instr, temp and val alias */ } - uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; + uint32_t const_labels = + label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; if (new_label & const_labels) { label &= ~val_labels | const_labels; label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ @@ -179,10 +185,7 @@ struct ssa_info { instr = vec; } - bool is_vec() - { - return label & label_vec; - } + bool is_vec() { return label & label_vec; } void set_constant(chip_class chip, uint64_t constant) { @@ -210,14 +213,10 @@ struct ssa_info { bool is_constant(unsigned bits) { switch (bits) { - case 8: - return label & label_literal; - case 16: - return label & label_constant_16bit; - case 32: - return label & label_constant_32bit; - case 64: - return label & label_constant_64bit; + case 8: return label & label_literal; + case 16: return label & label_constant_16bit; + case 32: return label & label_constant_32bit; + case 64: return label & label_constant_64bit; } return false; } @@ -226,14 +225,10 @@ struct ssa_info { { bool is_lit = label & label_literal; switch (bits) { - case 8: - return false; - case 16: - return is_lit && ~(label 
& label_constant_16bit); - case 32: - return is_lit && ~(label & label_constant_32bit); - case 64: - return false; + case 8: return false; + case 16: return is_lit && ~(label & label_constant_16bit); + case 32: return is_lit && ~(label & label_constant_32bit); + case 64: return false; } return false; } @@ -252,10 +247,7 @@ struct ssa_info { temp = abs_temp; } - bool is_abs() - { - return label & label_abs; - } + bool is_abs() { return label & label_abs; } void set_neg(Temp neg_temp) { @@ -263,10 +255,7 @@ struct ssa_info { temp = neg_temp; } - bool is_neg() - { - return label & label_neg; - } + bool is_neg() { return label & label_neg; } void set_neg_abs(Temp neg_abs_temp) { @@ -280,10 +269,7 @@ struct ssa_info { instr = mul; } - bool is_mul() - { - return label & label_mul; - } + bool is_mul() { return label & label_mul; } void set_temp(Temp tmp) { @@ -291,10 +277,7 @@ struct ssa_info { temp = tmp; } - bool is_temp() - { - return label & label_temp; - } + bool is_temp() { return label & label_temp; } void set_mad(Instruction* mad, uint32_t mad_info_idx) { @@ -303,10 +286,7 @@ struct ssa_info { instr = mad; } - bool is_mad() - { - return label & label_mad; - } + bool is_mad() { return label & label_mad; } void set_omod2(Instruction* mul) { @@ -314,10 +294,7 @@ struct ssa_info { instr = mul; } - bool is_omod2() - { - return label & label_omod2; - } + bool is_omod2() { return label & label_omod2; } void set_omod4(Instruction* mul) { @@ -325,10 +302,7 @@ struct ssa_info { instr = mul; } - bool is_omod4() - { - return label & label_omod4; - } + bool is_omod4() { return label & label_omod4; } void set_omod5(Instruction* mul) { @@ -336,31 +310,19 @@ struct ssa_info { instr = mul; } - bool is_omod5() - { - return label & label_omod5; - } + bool is_omod5() { return label & label_omod5; } - void set_clamp(Instruction *med3) + void set_clamp(Instruction* med3) { add_label(label_clamp); instr = med3; } - bool is_clamp() - { - return label & label_clamp; - } + bool is_clamp() { return label & label_clamp; } - void set_undefined() - { - add_label(label_undefined); - } + void set_undefined() { add_label(label_undefined); } - bool is_undefined() - { - return label & label_undefined; - } + bool is_undefined() { return label & label_undefined; } void set_vcc(Temp vcc_val) { @@ -368,10 +330,7 @@ struct ssa_info { temp = vcc_val; } - bool is_vcc() - { - return label & label_vcc; - } + bool is_vcc() { return label & label_vcc; } void set_b2f(Temp b2f_val) { @@ -379,74 +338,47 @@ struct ssa_info { temp = b2f_val; } - bool is_b2f() - { - return label & label_b2f; - } + bool is_b2f() { return label & label_b2f; } - void set_add_sub(Instruction *add_sub_instr) + void set_add_sub(Instruction* add_sub_instr) { add_label(label_add_sub); instr = add_sub_instr; } - bool is_add_sub() - { - return label & label_add_sub; - } + bool is_add_sub() { return label & label_add_sub; } - void set_bitwise(Instruction *bitwise_instr) + void set_bitwise(Instruction* bitwise_instr) { add_label(label_bitwise); instr = bitwise_instr; } - bool is_bitwise() - { - return label & label_bitwise; - } + bool is_bitwise() { return label & label_bitwise; } - void set_uniform_bitwise() - { - add_label(label_uniform_bitwise); - } + void set_uniform_bitwise() { add_label(label_uniform_bitwise); } - bool is_uniform_bitwise() - { - return label & label_uniform_bitwise; - } + bool is_uniform_bitwise() { return label & label_uniform_bitwise; } - void set_minmax(Instruction *minmax_instr) + void set_minmax(Instruction* minmax_instr) { 
add_label(label_minmax); instr = minmax_instr; } - bool is_minmax() - { - return label & label_minmax; - } + bool is_minmax() { return label & label_minmax; } - void set_vopc(Instruction *vopc_instr) + void set_vopc(Instruction* vopc_instr) { add_label(label_vopc); instr = vopc_instr; } - bool is_vopc() - { - return label & label_vopc; - } + bool is_vopc() { return label & label_vopc; } - void set_scc_needed() - { - add_label(label_scc_needed); - } + void set_scc_needed() { add_label(label_scc_needed); } - bool is_scc_needed() - { - return label & label_scc_needed; - } + bool is_scc_needed() { return label & label_scc_needed; } void set_scc_invert(Temp scc_inv) { @@ -454,10 +386,7 @@ struct ssa_info { temp = scc_inv; } - bool is_scc_invert() - { - return label & label_scc_invert; - } + bool is_scc_invert() { return label & label_scc_invert; } void set_uniform_bool(Temp uniform_bool) { @@ -465,20 +394,11 @@ struct ssa_info { temp = uniform_bool; } - bool is_uniform_bool() - { - return label & label_uniform_bool; - } + bool is_uniform_bool() { return label & label_uniform_bool; } - void set_vcc_hint() - { - add_label(label_vcc_hint); - } + void set_vcc_hint() { add_label(label_vcc_hint); } - bool is_vcc_hint() - { - return label & label_vcc_hint; - } + bool is_vcc_hint() { return label & label_vcc_hint; } void set_b2i(Temp b2i_val) { @@ -486,21 +406,15 @@ struct ssa_info { temp = b2i_val; } - bool is_b2i() - { - return label & label_b2i; - } + bool is_b2i() { return label & label_b2i; } - void set_usedef(Instruction *label_instr) + void set_usedef(Instruction* label_instr) { add_label(label_usedef); instr = label_instr; } - bool is_usedef() - { - return label & label_usedef; - } + bool is_usedef() { return label & label_usedef; } void set_vop3p(Instruction* vop3p_instr) { @@ -508,10 +422,7 @@ struct ssa_info { instr = vop3p_instr; } - bool is_vop3p() - { - return label & label_vop3p; - } + bool is_vop3p() { return label & label_vop3p; } void set_fcanonicalize(Temp tmp) { @@ -519,42 +430,27 @@ struct ssa_info { temp = tmp; } - bool is_fcanonicalize() - { - return label & label_fcanonicalize; - } + bool is_fcanonicalize() { return label & label_fcanonicalize; } - void set_canonicalized() - { - add_label(label_canonicalized); - } + void set_canonicalized() { add_label(label_canonicalized); } - bool is_canonicalized() - { - return label & label_canonicalized; - } + bool is_canonicalized() { return label & label_canonicalized; } - void set_extract(Instruction *extract) + void set_extract(Instruction* extract) { add_label(label_extract); instr = extract; } - bool is_extract() - { - return label & label_extract; - } + bool is_extract() { return label & label_extract; } - void set_insert(Instruction *insert) + void set_insert(Instruction* insert) { add_label(label_insert); instr = insert; } - bool is_insert() - { - return label & label_insert; - } + bool is_insert() { return label & label_insert; } }; struct opt_ctx { @@ -562,7 +458,7 @@ struct opt_ctx { float_mode fp_mode; std::vector> instructions; ssa_info* info; - std::pair last_literal; + std::pair last_literal; std::vector mad_infos; std::vector uses; }; @@ -577,9 +473,10 @@ struct CmpInfo { unsigned size; }; -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info); +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo* info); -bool can_swap_operands(aco_ptr& instr) +bool +can_swap_operands(aco_ptr& instr) { if (instr->operands[0].isConstant() || (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) @@ 
-612,23 +509,12 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_max_i16_e64: case aco_opcode::v_min_i16_e64: case aco_opcode::v_max_u16_e64: - case aco_opcode::v_min_u16_e64: - return true; - case aco_opcode::v_sub_f16: - instr->opcode = aco_opcode::v_subrev_f16; - return true; - case aco_opcode::v_sub_f32: - instr->opcode = aco_opcode::v_subrev_f32; - return true; - case aco_opcode::v_sub_co_u32: - instr->opcode = aco_opcode::v_subrev_co_u32; - return true; - case aco_opcode::v_sub_u16: - instr->opcode = aco_opcode::v_subrev_u16; - return true; - case aco_opcode::v_sub_u32: - instr->opcode = aco_opcode::v_subrev_u32; - return true; + case aco_opcode::v_min_u16_e64: return true; + case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; return true; + case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; + case aco_opcode::v_sub_co_u32: instr->opcode = aco_opcode::v_subrev_co_u32; return true; + case aco_opcode::v_sub_u16: instr->opcode = aco_opcode::v_subrev_u16; return true; + case aco_opcode::v_sub_u32: instr->opcode = aco_opcode::v_subrev_u32; return true; default: { CmpInfo info; get_cmp_info(instr->opcode, &info); @@ -645,7 +531,8 @@ bool can_swap_operands(aco_ptr& instr) } } -bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) +bool +can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) { if (instr->isVOP3()) return true; @@ -659,36 +546,34 @@ bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) if (instr->isDPP() || instr->isSDWA()) return false; - return instr->opcode != aco_opcode::v_madmk_f32 && - instr->opcode != aco_opcode::v_madak_f32 && - instr->opcode != aco_opcode::v_madmk_f16 && - instr->opcode != aco_opcode::v_madak_f16 && - instr->opcode != aco_opcode::v_fmamk_f32 && - instr->opcode != aco_opcode::v_fmaak_f32 && - instr->opcode != aco_opcode::v_fmamk_f16 && - instr->opcode != aco_opcode::v_fmaak_f16 && + return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && + instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 && + instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 && instr->opcode != aco_opcode::v_readlane_b32 && instr->opcode != aco_opcode::v_writelane_b32 && instr->opcode != aco_opcode::v_readfirstlane_b32; } -bool pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, - Temp temp, unsigned index) +bool +pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsigned index) { if (instr->definitions.empty()) return false; - const bool vgpr = instr->opcode == aco_opcode::p_as_uniform || - std::all_of(instr->definitions.begin(), instr->definitions.end(), - [] (const Definition& def) { return def.regClass().type() == RegType::vgpr;}); + const bool vgpr = + instr->opcode == aco_opcode::p_as_uniform || + std::all_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) { return def.regClass().type() == RegType::vgpr; }); /* don't propagate VGPRs into SGPR instructions */ if (temp.type() == RegType::vgpr && !vgpr) return false; - bool can_accept_sgpr = ctx.program->chip_class >= GFX9 || - std::none_of(instr->definitions.begin(), instr->definitions.end(), - [] (const Definition& def) { return def.regClass().is_subdword();}); + bool can_accept_sgpr = + ctx.program->chip_class >= GFX9 || + std::none_of(instr->definitions.begin(), instr->definitions.end(), + [](const Definition& def) { return 
def.regClass().is_subdword(); }); switch (instr->opcode) { case aco_opcode::p_phi: @@ -725,15 +610,15 @@ bool pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, if (temp.regClass() == instr->definitions[0].regClass()) instr->opcode = aco_opcode::p_parallelcopy; break; - default: - return false; + default: return false; } instr->operands[index].setTemp(temp); return true; } -bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) +bool +can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP()) return false; @@ -746,14 +631,16 @@ bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) instr->opcode != aco_opcode::v_permlanex16_b32; } -void to_VOP3(opt_ctx& ctx, aco_ptr& instr) +void +to_VOP3(opt_ctx& ctx, aco_ptr& instr) { if (instr->isVOP3()) return; aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); for (unsigned i = 0; i < instr->definitions.size(); i++) { instr->definitions[i] = tmp->definitions[i]; @@ -767,12 +654,14 @@ void to_VOP3(opt_ctx& ctx, aco_ptr& instr) * been applied yet or this instruction isn't dead and so they've been ignored */ } -bool is_operand_vgpr(Operand op) +bool +is_operand_vgpr(Operand op) { return op.isTemp() && op.getTemp().type() == RegType::vgpr; } -void to_SDWA(opt_ctx& ctx, aco_ptr& instr) +void +to_SDWA(opt_ctx& ctx, aco_ptr& instr) { aco_ptr tmp = convert_to_SDWA(ctx.program->chip_class, instr); if (!tmp) @@ -786,15 +675,15 @@ void to_SDWA(opt_ctx& ctx, aco_ptr& instr) } /* only covers special cases */ -bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) +bool +alu_can_accept_constant(aco_opcode opcode, unsigned operand) { switch (opcode) { case aco_opcode::v_interp_p2_f32: case aco_opcode::v_mac_f32: case aco_opcode::v_writelane_b32: case aco_opcode::v_writelane_b32_e64: - case aco_opcode::v_cndmask_b32: - return operand != 2; + case aco_opcode::v_cndmask_b32: return operand != 2; case aco_opcode::s_addk_i32: case aco_opcode::s_mulk_i32: case aco_opcode::p_wqm: @@ -804,25 +693,28 @@ bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) case aco_opcode::v_readlane_b32_e64: case aco_opcode::v_readfirstlane_b32: case aco_opcode::p_extract: - case aco_opcode::p_insert: - return operand != 0; - default: - return true; + case aco_opcode::p_insert: return operand != 0; + default: return true; } } -bool valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) +bool +valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) { - if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 || - instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) + if (instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32_e64 || + instr->opcode == aco_opcode::v_writelane_b32 || + instr->opcode == aco_opcode::v_writelane_b32_e64) return operand != 1; - if (instr->opcode == aco_opcode::v_permlane16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32) + if (instr->opcode == aco_opcode::v_permlane16_b32 || + instr->opcode == aco_opcode::v_permlanex16_b32) return operand == 0; return true; } /* check constant bus and literal limitations */ -bool check_vop3_operands(opt_ctx& ctx, unsigned 
num_operands, Operand *operands) +bool +check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands) { int limit = ctx.program->chip_class >= GFX10 ? 2 : 1; Operand literal32(s1); @@ -869,7 +761,9 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) +bool +parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset, + bool prevent_overflow) { Operand op = instr->operands[op_index]; @@ -879,17 +773,15 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp if (!ctx.info[tmp.id()].is_add_sub()) return false; - Instruction *add_instr = ctx.info[tmp.id()].instr; + Instruction* add_instr = ctx.info[tmp.id()].instr; switch (add_instr->opcode) { case aco_opcode::v_add_u32: case aco_opcode::v_add_co_u32: case aco_opcode::v_add_co_u32_e64: case aco_opcode::s_add_i32: - case aco_opcode::s_add_u32: - break; - default: - return false; + case aco_opcode::s_add_u32: break; + default: return false; } if (prevent_overflow && !add_instr->definitions[0].isNUW()) return false; @@ -921,11 +813,13 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } -unsigned get_operand_size(aco_ptr& instr, unsigned index) +unsigned +get_operand_size(aco_ptr& instr, unsigned index) { if (instr->isPseudo()) return instr->operands[index].bytes() * 8u; - else if (instr->opcode == aco_opcode::v_mad_u64_u32 || instr->opcode == aco_opcode::v_mad_i64_i32) + else if (instr->opcode == aco_opcode::v_mad_u64_u32 || + instr->opcode == aco_opcode::v_mad_i64_i32) return index == 2 ? 64 : 32; else if (instr->isVALU() || instr->isSALU()) return instr_info.operand_size[(int)instr->opcode]; @@ -933,19 +827,22 @@ unsigned get_operand_size(aco_ptr& instr, unsigned index) return 0; } -Operand get_constant_op(opt_ctx &ctx, ssa_info info, uint32_t bits) +Operand +get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits) { if (bits == 64) return Operand(info.val, true); return Operand::get_const(ctx.program->chip_class, info.val, bits / 8u); } -bool fixed_to_exec(Operand op) +bool +fixed_to_exec(Operand op) { return op.isFixed() && op.physReg() == exec; } -int parse_extract(Instruction *instr) +int +parse_extract(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract) { bool is_byte = instr->operands[2].constantEquals(8); @@ -961,7 +858,8 @@ int parse_extract(Instruction *instr) } } -int parse_insert(Instruction *instr) +int +parse_insert(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) && instr->operands[1].constantEquals(0)) { @@ -976,7 +874,8 @@ int parse_insert(Instruction *instr) } } -bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_info& info) +bool +can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& info) { if (idx >= 2) return false; @@ -990,7 +889,8 @@ bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, return true; } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { - if (instr->isSDWA() && (static_cast(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword) + if (instr->isSDWA() && + (static_cast(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword) return false; return true; } else if (instr->isVOP3() && (sel & sdwa_isword) && @@ 
-1005,7 +905,8 @@ bool can_apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, /* Combine an p_extract (or p_insert, in some cases) instruction with instr. * instr(p_extract(...)) -> instr() */ -void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_info& info) +void +apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& info) { Temp tmp = info.instr->operands[0].getTemp(); unsigned sel = parse_extract(info.instr); @@ -1013,18 +914,10 @@ void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_ if (sel == sdwa_udword || sel == sdwa_sdword) { } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) { switch (sel) { - case sdwa_ubyte0: - instr->opcode = aco_opcode::v_cvt_f32_ubyte0; - break; - case sdwa_ubyte1: - instr->opcode = aco_opcode::v_cvt_f32_ubyte1; - break; - case sdwa_ubyte2: - instr->opcode = aco_opcode::v_cvt_f32_ubyte2; - break; - case sdwa_ubyte3: - instr->opcode = aco_opcode::v_cvt_f32_ubyte3; - break; + case sdwa_ubyte0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case sdwa_ubyte1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case sdwa_ubyte2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case sdwa_ubyte3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { @@ -1041,7 +934,8 @@ void apply_extract(opt_ctx &ctx, aco_ptr& instr, unsigned idx, ssa_ ctx.info[def.tempId()].label &= label_vopc; } -void check_sdwa_extract(opt_ctx &ctx, aco_ptr& instr) +void +check_sdwa_extract(opt_ctx& ctx, aco_ptr& instr) { /* only VALU can use SDWA */ if (!instr->isVALU()) @@ -1060,7 +954,8 @@ void check_sdwa_extract(opt_ctx &ctx, aco_ptr& instr) } } -bool does_fp_op_flush_denorms(opt_ctx &ctx, aco_opcode op) +bool +does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op) { if (ctx.program->chip_class <= GFX8) { switch (op) { @@ -1070,18 +965,17 @@ bool does_fp_op_flush_denorms(opt_ctx &ctx, aco_opcode op) case aco_opcode::v_min3_f32: case aco_opcode::v_max3_f32: case aco_opcode::v_min_f16: - case aco_opcode::v_max_f16: - return false; - default: - break; + case aco_opcode::v_max_f16: return false; + default: break; } } return op != aco_opcode::v_cndmask_b32; } -bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_ptr& instr, Temp tmp) +bool +can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr& instr, Temp tmp) { - float_mode *fp = &ctx.fp_mode; + float_mode* fp = &ctx.fp_mode; if (ctx.info[tmp.id()].is_canonicalized() || (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep) return true; @@ -1090,14 +984,17 @@ bool can_eliminate_fcanonicalize(opt_ctx &ctx, aco_ptr& instr, Temp return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op); } -bool is_copy_label(opt_ctx &ctx, aco_ptr& instr, ssa_info& info) +bool +is_copy_label(opt_ctx& ctx, aco_ptr& instr, ssa_info& info) { - return info.is_temp() || (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp)); + return info.is_temp() || + (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp)); } -bool is_op_canonicalized(opt_ctx &ctx, Operand op) +bool +is_op_canonicalized(opt_ctx& ctx, Operand op) { - float_mode *fp = &ctx.fp_mode; + float_mode* fp = &ctx.fp_mode; if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) || (op.bytes() == 4 ? 
fp->denorm32 : fp->denorm16_64) == fp_denorm_keep) return true; @@ -1112,22 +1009,24 @@ bool is_op_canonicalized(opt_ctx &ctx, Operand op) return false; } -void label_instruction(opt_ctx &ctx, aco_ptr& instr) +void +label_instruction(opt_ctx& ctx, aco_ptr& instr) { if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) { ASSERTED bool all_const = false; for (Operand& op : instr->operands) - all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); + all_const = + all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32)); perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get()); ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32; - perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", instr.get()); + perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead", + instr.get()); } - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { if (!instr->operands[i].isTemp()) continue; @@ -1161,18 +1060,22 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* VALU: propagate neg, abs & inline constants */ else if (instr->isVALU()) { - if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { + if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr && + valu_can_accept_vgpr(instr, i)) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */ - if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && instr->operands.size() == 1) { + if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && + instr->operands.size() == 1) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } - /* for instructions other than v_cndmask_b32, the size of the instruction should match the operand size */ - unsigned can_use_mod = instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; + /* for instructions other than v_cndmask_b32, the size of the instruction should match the + * operand size */ + unsigned can_use_mod = + instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4; can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode]; if (instr->isSDWA()) @@ -1186,7 +1089,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) { instr->opcode = i ? 
aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16; instr->operands[i].setTemp(info.temp); - } else if (info.is_neg() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) { + } else if (info.is_neg() && can_use_mod && + can_eliminate_fcanonicalize(ctx, instr, info.temp)) { if (!instr->isDPP() && !instr->isSDWA()) to_VOP3(ctx, instr); instr->operands[i].setTemp(info.temp); @@ -1213,7 +1117,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) && (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) { Operand op = get_constant_op(ctx, info, bits); - perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); + perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, + "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->isSDWA() || instr->isVOP3P() || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { @@ -1248,23 +1153,28 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->chip_class < GFX9; bool saddr_prevent_overflow = mubuf.swizzled; - if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) { + if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) && + mubuf.offset + info.val < 4096) { assert(!mubuf.idxen); instr->operands[1] = Operand(v1); mubuf.offset += info.val; mubuf.offen = false; continue; } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) { - instr->operands[2] = Operand((uint32_t) 0); + instr->operands[2] = Operand((uint32_t)0); mubuf.offset += info.val; continue; - } else if (mubuf.offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) && + } else if (mubuf.offen && i == 1 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, + vaddr_prevent_overflow) && base.regClass() == v1 && mubuf.offset + offset < 4096) { assert(!mubuf.idxen); instr->operands[1].setTemp(base); mubuf.offset += offset; continue; - } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) && + } else if (i == 2 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, + saddr_prevent_overflow) && base.regClass() == s1 && mubuf.offset + offset < 4096) { instr->operands[i].setTemp(base); mubuf.offset += offset; @@ -1279,17 +1189,24 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) Temp base; uint32_t offset; bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; - if (has_usable_ds_offset && - i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && + if (has_usable_ds_offset && i == 0 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && instr->opcode != aco_opcode::ds_swizzle_b32) { - if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || - instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) { - unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 0x7 : 0x3; - unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || instr->opcode == aco_opcode::ds_read2_b64) ? 
3 : 2; + if (instr->opcode == aco_opcode::ds_write2_b32 || + instr->opcode == aco_opcode::ds_read2_b32 || + instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) { + unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) + ? 0x7 + : 0x3; + unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 || + instr->opcode == aco_opcode::ds_read2_b64) + ? 3 + : 2; - if ((offset & mask) == 0 && - ds.offset0 + (offset >> shifts) <= 255 && + if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 && ds.offset1 + (offset >> shifts) <= 255) { instr->operands[i].setTemp(base); ds.offset0 += offset >> shifts; @@ -1317,18 +1234,20 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { instr->operands[i] = Operand(info.val); continue; - } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + } else if (i == 1 && + parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && + base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4); - if (soe && - (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) || - ctx.info[smem.operands.back().tempId()].val != 0)) { + if (soe && (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) || + ctx.info[smem.operands.back().tempId()].val != 0)) { continue; } if (soe) { smem.operands[1] = Operand(offset); smem.operands.back() = Operand(base); } else { - SMEM_instruction *new_instr = create_instruction(smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size()); + SMEM_instruction* new_instr = create_instruction( + smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size()); new_instr->operands[0] = smem.operands[0]; new_instr->operands[1] = Operand(offset); if (smem.definitions.empty()) @@ -1350,7 +1269,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) else if (instr->isBranch()) { if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) { /* Flip the branch instruction to get rid of the scc_invert instruction */ - instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z; + instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? 
aco_opcode::p_cbranch_nz + : aco_opcode::p_cbranch_z; instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp); } } @@ -1415,7 +1335,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (ops.size() != instr->operands.size()) { assert(ops.size() > instr->operands.size()); Definition def = instr->definitions[0]; - instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, ops.size(), 1)); + instr.reset(create_instruction(aco_opcode::p_create_vector, + Format::PSEUDO, ops.size(), 1)); for (unsigned i = 0; i < ops.size(); i++) { if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() && ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass()) @@ -1450,16 +1371,19 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) unsigned split_offset = 0; unsigned vec_offset = 0; unsigned vec_index = 0; - for (unsigned i = 0; i < instr->definitions.size(); split_offset += instr->definitions[i++].bytes()) { + for (unsigned i = 0; i < instr->definitions.size(); + split_offset += instr->definitions[i++].bytes()) { while (vec_offset < split_offset && vec_index < vec->operands.size()) vec_offset += vec->operands[vec_index++].bytes(); - if (vec_offset != split_offset || vec->operands[vec_index].bytes() != instr->definitions[i].bytes()) + if (vec_offset != split_offset || + vec->operands[vec_index].bytes() != instr->definitions[i].bytes()) continue; Operand vec_op = vec->operands[vec_index]; if (vec_op.isConstant()) { - ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, vec_op.constantValue64()); + ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class, + vec_op.constantValue64()); } else if (vec_op.isUndefined()) { ctx.info[instr->definitions[i].tempId()].set_undefined(); } else { @@ -1493,7 +1417,9 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* propagate constants */ uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u); uint32_t val = (info.val >> (dst_offset * 8u)) & mask; - instr->operands[0] = Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());; + instr->operands[0] = + Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes()); + ; } else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } @@ -1512,10 +1438,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so * duplicate the vector instead. 
*/ - Instruction *vec = ctx.info[instr->operands[0].tempId()].instr; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; aco_ptr old_copy = std::move(instr); - instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1)); + instr.reset(create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1)); instr->definitions[0] = old_copy->definitions[0]; std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin()); for (unsigned i = 0; i < vec->operands.size(); i++) { @@ -1534,7 +1461,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } else if (instr->usesModifiers()) { // TODO } else if (instr->operands[0].isConstant()) { - ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, instr->operands[0].constantValue64()); + ctx.info[instr->definitions[0].tempId()].set_constant( + ctx.program->chip_class, instr->operands[0].constantValue64()); } else if (instr->operands[0].isTemp()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); if (ctx.info[instr->operands[0].tempId()].is_canonicalized()) @@ -1558,11 +1486,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) for (unsigned i = 0; i < 2; i++) { if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { if (!instr->isDPP() && !instr->isSDWA() && - (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */ + (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */ instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */ bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u); - VOP3_instruction *vop3 = instr->isVOP3() ? &instr->vop3() : NULL; + VOP3_instruction* vop3 = instr->isVOP3() ? &instr->vop3() : NULL; if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod)) continue; @@ -1580,14 +1508,18 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other); } else if (uses_mods) { continue; - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */ ctx.info[instr->operands[i].tempId()].set_omod2(instr.get()); - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */ ctx.info[instr->operands[i].tempId()].set_omod4(instr.get()); - } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ + } else if (instr->operands[!i].constantValue() == + (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(instr.get()); } else if (instr->operands[!i].constantValue() == 0u && - !(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64 : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ + !(fp16 ? 
ctx.fp_mode.preserve_signed_zero_inf_nan16_64 + : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); } else { continue; @@ -1609,16 +1541,14 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::v_med3_f16: case aco_opcode::v_med3_f32: { /* clamp */ VOP3_instruction& vop3 = instr->vop3(); - if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || - vop3.neg[0] || vop3.neg[1] || vop3.neg[2] || + if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] || vop3.omod != 0 || vop3.opsel != 0) break; unsigned idx = 0; bool found_zero = false, found_one = false; bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; - for (unsigned i = 0; i < 3; i++) - { + for (unsigned i = 0; i < 3; i++) { if (instr->operands[i].constantEquals(0)) found_zero = true; else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */ @@ -1631,23 +1561,22 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) break; } case aco_opcode::v_cndmask_b32: - if (instr->operands[0].constantEquals(0) && - instr->operands[1].constantEquals(0xFFFFFFFF)) + if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF)) ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp()); else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0x3f800000u)) ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp()); - else if (instr->operands[0].constantEquals(0) && - instr->operands[1].constantEquals(1)) + else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1)) ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp()); ctx.info[instr->operands[2].tempId()].set_vcc_hint(); break; case aco_opcode::v_cmp_lg_u32: if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */ - instr->operands[0].constantEquals(0) && - instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc()) - ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() && + ctx.info[instr->operands[1].tempId()].is_vcc()) + ctx.info[instr->definitions[0].tempId()].set_temp( + ctx.info[instr->operands[1].tempId()].temp); break; case aco_opcode::p_linear_phi: { /* lower_bool_phis() can create phis like this */ @@ -1656,7 +1585,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) if (all_same_temp) all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass(); for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) { - if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId()) + if (!instr->operands[i].isTemp() || + instr->operands[i].tempId() != instr->operands[0].tempId()) all_same_temp = false; } if (all_same_temp) { @@ -1684,10 +1614,12 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_not_b64: if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); - ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].temp); + ctx.info[instr->definitions[1].tempId()].set_scc_invert( + ctx.info[instr->operands[0].tempId()].temp); } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { 
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); - ctx.info[instr->definitions[1].tempId()].set_scc_invert(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + ctx.info[instr->definitions[1].tempId()].set_scc_invert( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); } ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); break; @@ -1695,21 +1627,29 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_and_b64: if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) { if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) { - /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a uniform bool into divergent */ - ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].temp); - ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].temp); + /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a + * uniform bool into divergent */ + ctx.info[instr->definitions[1].tempId()].set_temp( + ctx.info[instr->operands[0].tempId()].temp); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool( + ctx.info[instr->operands[0].tempId()].temp); break; } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) { - /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction already produces the same SCC */ - ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); - ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction + * already produces the same SCC */ + ctx.info[instr->definitions[1].tempId()].set_temp( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); + ctx.info[instr->definitions[0].tempId()].set_uniform_bool( + ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); break; } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) { Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr; - /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */ + /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus + * already produces the same result */ if (vopc_instr->pass_flags == instr->pass_flags) { assert(instr->pass_flags > 0); - ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp()); + ctx.info[instr->definitions[0].tempId()].set_temp( + vopc_instr->definitions[0].getTemp()); break; } } @@ -1719,8 +1659,11 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) case aco_opcode::s_or_b64: case aco_opcode::s_xor_b32: case aco_opcode::s_xor_b64: - if (std::all_of(instr->operands.begin(), instr->operands.end(), [&ctx](const Operand& op) { - return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || ctx.info[op.tempId()].is_uniform_bitwise()); + if (std::all_of(instr->operands.begin(), instr->operands.end(), + [&ctx](const Operand& op) + { + return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() || + ctx.info[op.tempId()].is_uniform_bitwise()); })) { ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise(); } @@ -1749,8 +1692,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) break; case 
aco_opcode::s_cselect_b64: case aco_opcode::s_cselect_b32: - if (instr->operands[0].constantEquals((unsigned) -1) && - instr->operands[1].constantEquals(0)) { + if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) { /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */ ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp()); } @@ -1761,8 +1703,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } break; case aco_opcode::p_wqm: - if (instr->operands[0].isTemp() && - ctx.info[instr->operands[0].tempId()].is_scc_invert()) { + if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) { ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); } break; @@ -1790,8 +1731,7 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) } break; } - default: - break; + default: break; } /* Don't remove label_extract if we can't apply the extract to @@ -1800,93 +1740,104 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) check_sdwa_extract(ctx, instr); } -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) +ALWAYS_INLINE bool +get_cmp_info(aco_opcode op, CmpInfo* info) { info->ordered = aco_opcode::num_opcodes; info->unordered = aco_opcode::num_opcodes; info->ordered_swapped = aco_opcode::num_opcodes; info->unordered_swapped = aco_opcode::num_opcodes; switch (op) { - #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ - case aco_opcode::v_cmp_##ord##_f##sz:\ - case aco_opcode::v_cmp_n##unord##_f##sz:\ - info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ - info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ - info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\ - info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\ - info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ - info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ - info->size = sz;\ + // clang-format off +#define CMP2(ord, unord, ord_swap, unord_swap, sz) \ + case aco_opcode::v_cmp_##ord##_f##sz: \ + case aco_opcode::v_cmp_n##unord##_f##sz: \ + info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ + info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ + info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \ + info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ + : aco_opcode::v_cmp_n##ord##_f##sz; \ + info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? 
aco_opcode::v_cmp_##ord##_f32 \ + : aco_opcode::v_cmp_n##unord##_f32; \ + info->size = sz; \ return true; - #define CMP(ord, unord, ord_swap, unord_swap) \ - CMP2(ord, unord, ord_swap, unord_swap, 16)\ - CMP2(ord, unord, ord_swap, unord_swap, 32)\ +#define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16) \ + CMP2(ord, unord, ord_swap, unord_swap, 32) \ CMP2(ord, unord, ord_swap, unord_swap, 64) - CMP(lt, /*n*/ge, gt, /*n*/le) - CMP(eq, /*n*/lg, eq, /*n*/lg) - CMP(le, /*n*/gt, ge, /*n*/lt) - CMP(gt, /*n*/le, lt, /*n*/le) - CMP(lg, /*n*/eq, lg, /*n*/eq) - CMP(ge, /*n*/lt, le, /*n*/gt) - #undef CMP - #undef CMP2 - #define ORD_TEST(sz) \ - case aco_opcode::v_cmp_u_f##sz:\ - info->f32 = aco_opcode::v_cmp_u_f32;\ - info->inverse = aco_opcode::v_cmp_o_f##sz;\ - info->size = sz;\ - return true;\ - case aco_opcode::v_cmp_o_f##sz:\ - info->f32 = aco_opcode::v_cmp_o_f32;\ - info->inverse = aco_opcode::v_cmp_u_f##sz;\ - info->size = sz;\ + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) +#undef CMP +#undef CMP2 +#define ORD_TEST(sz) \ + case aco_opcode::v_cmp_u_f##sz: \ + info->f32 = aco_opcode::v_cmp_u_f32; \ + info->inverse = aco_opcode::v_cmp_o_f##sz; \ + info->size = sz; \ + return true; \ + case aco_opcode::v_cmp_o_f##sz: \ + info->f32 = aco_opcode::v_cmp_o_f32; \ + info->inverse = aco_opcode::v_cmp_u_f##sz; \ + info->size = sz; \ return true; - ORD_TEST(16) - ORD_TEST(32) - ORD_TEST(64) - #undef ORD_TEST - default: - return false; + ORD_TEST(16) + ORD_TEST(32) + ORD_TEST(64) +#undef ORD_TEST + // clang-format on + default: return false; } } -aco_opcode get_ordered(aco_opcode op) +aco_opcode +get_ordered(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; } -aco_opcode get_unordered(aco_opcode op) +aco_opcode +get_unordered(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; } -aco_opcode get_inverse(aco_opcode op) +aco_opcode +get_inverse(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; } -aco_opcode get_f32_cmp(aco_opcode op) +aco_opcode +get_f32_cmp(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; } -unsigned get_cmp_bitsize(aco_opcode op) +unsigned +get_cmp_bitsize(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) ? 
info.size : 0; } -bool is_cmp(aco_opcode op) +bool +is_cmp(aco_opcode op) { CmpInfo info; return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; } -unsigned original_temp_id(opt_ctx &ctx, Temp tmp) +unsigned +original_temp_id(opt_ctx& ctx, Temp tmp) { if (ctx.info[tmp.id()].is_temp()) return ctx.info[tmp.id()].temp.id(); @@ -1894,7 +1845,8 @@ unsigned original_temp_id(opt_ctx &ctx, Temp tmp) return tmp.id(); } -void decrease_uses(opt_ctx &ctx, Instruction* instr) +void +decrease_uses(opt_ctx& ctx, Instruction* instr) { if (!--ctx.uses[instr->definitions[0].tempId()]) { for (const Operand& op : instr->operands) { @@ -1904,14 +1856,15 @@ void decrease_uses(opt_ctx &ctx, Instruction* instr) } } -Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) +Instruction* +follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false) { if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels)) return nullptr; if (!ignore_uses && ctx.uses[op.tempId()] > 1) return nullptr; - Instruction *instr = ctx.info[op.tempId()].instr; + Instruction* instr = ctx.info[op.tempId()].instr; if (instr->definitions.size() == 2) { assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId()); @@ -1924,7 +1877,8 @@ Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b) * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */ -bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) +bool +combine_ordering_test(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -1936,7 +1890,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) bool neg[2] = {false, false}; bool abs[2] = {false, false}; uint8_t opsel = 0; - Instruction *op_instr[2]; + Instruction* op_instr[2]; Temp op[2]; unsigned bitsize = 0; @@ -1957,7 +1911,8 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) if (op_instr[i]->isVOP3()) { VOP3_instruction& vop3 = op_instr[i]->vop3(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || + vop3.opsel == 2) return false; neg[i] = vop3.neg[0]; abs[i] = vop3.abs[0]; @@ -1988,25 +1943,20 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) aco_opcode new_op = aco_opcode::num_opcodes; switch (bitsize) { - case 16: - new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; - break; - case 32: - new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; - break; - case 64: - new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; - break; + case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break; + case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break; + case 64: new_op = is_or ? 
aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break; } - Instruction *new_instr; + Instruction* new_instr; if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { - VOP3_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); for (unsigned i = 0; i < 2; i++) { vop3->neg[i] = neg[i]; vop3->abs[i] = abs[i]; } vop3->opsel = opsel; - new_instr = static_cast(vop3); + new_instr = static_cast(vop3); } else { new_instr = create_instruction(new_op, Format::VOPC, 2, 1); instr->definitions[0].setHint(vcc); @@ -2025,7 +1975,8 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b) * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */ -bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +bool +combine_comparison_ordering(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -2035,8 +1986,8 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; - Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); - Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + Instruction* nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction* cmp = follow_operand(ctx, instr->operands[1], true); if (!nan_test || !cmp) return false; if (nan_test->isSDWA() || cmp->isSDWA()) @@ -2070,9 +2021,10 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) decrease_uses(ctx, cmp); aco_opcode new_op = is_or ? 
get_unordered(cmp->opcode) : get_ordered(cmp->opcode); - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2096,7 +2048,8 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) return true; } -bool is_operand_constant(opt_ctx &ctx, Operand op, unsigned bit_size, uint64_t *value) +bool +is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value) { if (op.isConstant()) { *value = op.constantValue64(); @@ -2111,7 +2064,8 @@ bool is_operand_constant(opt_ctx &ctx, Operand op, unsigned bit_size, uint64_t * return false; } -bool is_constant_nan(uint64_t value, unsigned bit_size) +bool +is_constant_nan(uint64_t value, unsigned bit_size) { if (bit_size == 16) return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff); @@ -2123,7 +2077,8 @@ bool is_constant_nan(uint64_t value, unsigned bit_size) /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b) * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */ -bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) +bool +combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].regClass() != ctx.program->lane_mask) return false; @@ -2132,8 +2087,8 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32; - Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); - Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + Instruction* nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction* cmp = follow_operand(ctx, instr->operands[1], true); if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA()) return false; @@ -2162,13 +2117,15 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in if (nan_test->isVOP3()) { VOP3_instruction& vop3 = nan_test->vop3(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || + vop3.opsel == 2) return false; } int constant_operand = -1; for (unsigned i = 0; i < 2; i++) { - if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { + if (cmp->operands[i].isTemp() && + original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { constant_operand = !i; break; } @@ -2190,9 +2147,10 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in decrease_uses(ctx, cmp); aco_opcode new_op = is_or ? 
get_unordered(cmp->opcode) : get_ordered(cmp->opcode); - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2217,14 +2175,15 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in } /* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */ -bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) +bool +combine_inverse_comparison(opt_ctx& ctx, aco_ptr& instr) { if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec) return false; if (ctx.uses[instr->definitions[1].tempId()]) return false; - Instruction *cmp = follow_operand(ctx, instr->operands[1]); + Instruction* cmp = follow_operand(ctx, instr->operands[1]); if (!cmp) return false; @@ -2240,9 +2199,10 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) /* This creates a new instruction instead of modifying the existing * comparison so that the comparison is done with the correct exec mask. */ - Instruction *new_instr; + Instruction* new_instr; if (cmp->isVOP3()) { - VOP3_instruction *new_vop3 = create_instruction(new_opcode, asVOP3(Format::VOPC), 2, 1); + VOP3_instruction* new_vop3 = + create_instruction(new_opcode, asVOP3(Format::VOPC), 2, 1); VOP3_instruction& cmp_vop3 = cmp->vop3(); memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs)); memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg)); @@ -2251,7 +2211,7 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) new_vop3->opsel = cmp_vop3.opsel; new_instr = new_vop3; } else if (cmp->isSDWA()) { - SDWA_instruction *new_sdwa = create_instruction( + SDWA_instruction* new_sdwa = create_instruction( new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1); SDWA_instruction& cmp_sdwa = cmp->sdwa(); memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs)); @@ -2280,25 +2240,24 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) /* op1(op2(1, 2), 0) if swap = false * op1(0, op2(1, 2)) if swap = true */ -bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, - Instruction* op1_instr, bool swap, const char *shuffle_str, - Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel, - bool *op1_clamp, uint8_t *op1_omod, - bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel, - bool *precise) +bool +match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap, + const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3], + uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg, + bool* inbetween_abs, bool* inbetween_opsel, bool* precise) { /* checks */ if (op1_instr->opcode != op1) return false; - Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]); + Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]); if (!op2_instr || op2_instr->opcode != op2) return false; if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1])) return false; - VOP3_instruction *op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL; - VOP3_instruction *op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL; + VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? 
&op1_instr->vop3() : NULL; + VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL; if (op1_instr->isSDWA() || op2_instr->isSDWA()) return false; @@ -2326,8 +2285,7 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap)) return false; - *precise = op1_instr->definitions[0].isPrecise() || - op2_instr->definitions[0].isPrecise(); + *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise(); int shuffle[3]; shuffle[shuffle_str[0] - '0'] = 0; @@ -2355,11 +2313,12 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, return true; } -void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr, - Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, - bool clamp, unsigned omod) +void +create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr, + Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp, + unsigned omod) { - VOP3_instruction *new_instr = create_instruction(opcode, Format::VOP3, 3, 1); + VOP3_instruction* new_instr = create_instruction(opcode, Format::VOP3, 3, 1); memcpy(new_instr->abs, abs, sizeof(bool[3])); memcpy(new_instr->neg, neg, sizeof(bool[3])); new_instr->clamp = clamp; @@ -2374,7 +2333,9 @@ void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& instr.reset(new_instr); } -bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) +bool +combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, + const char* shuffle, uint8_t ops) { for (unsigned swap = 0; swap < 2; swap++) { if (!((1 << swap) & ops)) @@ -2383,10 +2344,8 @@ bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode Operand operands[3]; bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; - if (match_op3_for_vop3(ctx, instr->opcode, op2, - instr.get(), swap, shuffle, - operands, neg, abs, &opsel, - &clamp, &omod, NULL, NULL, NULL, &precise)) { + if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg, + abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) { ctx.uses[instr->operands[swap].tempId()]--; create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod); return true; @@ -2396,14 +2355,17 @@ bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode } /* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */ -bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) +bool +combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) { bool is_or = instr->opcode == aco_opcode::v_or_b32; aco_opcode new_op_lshl = is_or ? 
aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32; - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) + if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, + "120", 1 | 2)) return true; - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) + if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, + "120", 1 | 2)) return true; if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2)) return true; @@ -2419,7 +2381,7 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b) */ for (unsigned i = 0; i < 2; i++) { - Instruction *extins = follow_operand(ctx, instr->operands[i]); + Instruction* extins = follow_operand(ctx, instr->operands[i]); if (!extins) continue; @@ -2429,14 +2391,17 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) if (extins->opcode == aco_opcode::p_insert && (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) { op = new_op_lshl; - operands[1] = Operand(extins->operands[1].constantValue() * extins->operands[2].constantValue()); - } else if (is_or && (extins->opcode == aco_opcode::p_insert || - (extins->opcode == aco_opcode::p_extract && extins->operands[3].constantEquals(0))) && + operands[1] = + Operand(extins->operands[1].constantValue() * extins->operands[2].constantValue()); + } else if (is_or && + (extins->opcode == aco_opcode::p_insert || + (extins->opcode == aco_opcode::p_extract && + extins->operands[3].constantEquals(0))) && extins->operands[1].constantEquals(0)) { op = aco_opcode::v_and_or_b32; operands[1] = Operand(extins->operands[2].constantEquals(8) ? 
0xffu : 0xffffu); } else { - continue; + continue; } operands[0] = extins->operands[0]; @@ -2459,7 +2424,8 @@ bool combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) return false; } -bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposite, aco_opcode minmax3) +bool +combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposite, aco_opcode minmax3) { /* TODO: this can handle SDWA min/max instructions by using opsel */ if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2)) @@ -2472,10 +2438,8 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; bool inbetween_neg; - if (match_op3_for_vop3(ctx, instr->opcode, opposite, - instr.get(), swap, "012", - operands, neg, abs, &opsel, - &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) && + if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg, + abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) && inbetween_neg) { ctx.uses[instr->operands[swap].tempId()]--; neg[1] = !neg[1]; @@ -2493,7 +2457,8 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b) * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b) * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */ -bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) { /* checks */ if (!instr->operands[0].isTemp()) @@ -2501,7 +2466,7 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) return false; - Instruction *op2_instr = follow_operand(ctx, instr->operands[0]); + Instruction* op2_instr = follow_operand(ctx, instr->operands[0]); if (!op2_instr) return false; switch (op2_instr->opcode) { @@ -2510,10 +2475,8 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::s_xor_b32: case aco_opcode::s_and_b64: case aco_opcode::s_or_b64: - case aco_opcode::s_xor_b64: - break; - default: - return false; + case aco_opcode::s_xor_b64: break; + default: return false; } /* create instruction */ @@ -2523,26 +2486,13 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) ctx.info[op2_instr->definitions[0].tempId()].label = 0; switch (op2_instr->opcode) { - case aco_opcode::s_and_b32: - op2_instr->opcode = aco_opcode::s_nand_b32; - break; - case aco_opcode::s_or_b32: - op2_instr->opcode = aco_opcode::s_nor_b32; - break; - case aco_opcode::s_xor_b32: - op2_instr->opcode = aco_opcode::s_xnor_b32; - break; - case aco_opcode::s_and_b64: - op2_instr->opcode = aco_opcode::s_nand_b64; - break; - case aco_opcode::s_or_b64: - op2_instr->opcode = aco_opcode::s_nor_b64; - break; - case aco_opcode::s_xor_b64: - op2_instr->opcode = aco_opcode::s_xnor_b64; - break; - default: - break; + case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break; + case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break; + case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break; + case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break; + case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break; + case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break; + default: break; } return true; @@ -2552,14 +2502,16 @@ bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr& instr) * s_or_b32(a, s_not_b32(b)) -> 
s_orn2_b32(a, b) * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b) * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */ -bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); - if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64)) + Instruction* op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && + op2_instr->opcode != aco_opcode::s_not_b64)) continue; if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0])) continue; @@ -2574,20 +2526,11 @@ bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].label = 0; switch (instr->opcode) { - case aco_opcode::s_and_b32: - instr->opcode = aco_opcode::s_andn2_b32; - break; - case aco_opcode::s_or_b32: - instr->opcode = aco_opcode::s_orn2_b32; - break; - case aco_opcode::s_and_b64: - instr->opcode = aco_opcode::s_andn2_b64; - break; - case aco_opcode::s_or_b64: - instr->opcode = aco_opcode::s_orn2_b64; - break; - default: - break; + case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break; + case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break; + case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break; + case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break; + default: break; } return true; @@ -2596,13 +2539,14 @@ bool combine_salu_n2(opt_ctx& ctx, aco_ptr& instr) } /* s_add_{i32,u32}(a, s_lshl_b32(b, )) -> s_lshl_add_u32(a, b) */ -bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) +bool +combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) { if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()]) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op2_instr = follow_operand(ctx, instr->operands[i], true); + Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true); if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 || ctx.uses[op2_instr->definitions[1].tempId()]) continue; @@ -2622,17 +2566,17 @@ bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr& instr) instr->operands[0] = op2_instr->operands[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - instr->opcode = std::array{aco_opcode::s_lshl1_add_u32, - aco_opcode::s_lshl2_add_u32, - aco_opcode::s_lshl3_add_u32, - aco_opcode::s_lshl4_add_u32}[shift - 1]; + instr->opcode = std::array{ + aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32, + aco_opcode::s_lshl4_add_u32}[shift - 1]; return true; } return false; } -bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode new_op, uint8_t ops) +bool +combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode new_op, uint8_t ops) { if (instr->usesModifiers()) return false; @@ -2640,16 +2584,17 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode n for (unsigned i = 0; i < 2; i++) { if (!((1 << i) & ops)) continue; - if (instr->operands[i].isTemp() && - ctx.info[instr->operands[i].tempId()].is_b2i() && + if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() && ctx.uses[instr->operands[i].tempId()] == 1) { aco_ptr new_instr; - if (instr->operands[!i].isTemp() && 
instr->operands[!i].getTemp().type() == RegType::vgpr) { + if (instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { new_instr.reset(create_instruction(new_op, Format::VOP2, 3, 2)); } else if (ctx.program->chip_class >= GFX10 || (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) { - new_instr.reset(create_instruction(new_op, asVOP3(Format::VOP2), 3, 2)); + new_instr.reset( + create_instruction(new_op, asVOP3(Format::VOP2), 3, 2)); } else { return false; } @@ -2678,19 +2623,20 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr& instr, aco_opcode n return false; } -bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) +bool +combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i]); - if (op_instr && - op_instr->opcode == aco_opcode::v_bcnt_u32_b32 && + Instruction* op_instr = follow_operand(ctx, instr->operands[i]); + if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 && op_instr->operands[0].isTemp() && op_instr->operands[0].getTemp().type() == RegType::vgpr && op_instr->operands[1].constantEquals(0)) { - aco_ptr new_instr{create_instruction(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)}; ctx.uses[instr->operands[i].tempId()]--; new_instr->operands[0] = op_instr->operands[0]; new_instr->operands[1] = instr->operands[!i]; @@ -2705,36 +2651,40 @@ bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) return false; } -bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only) +bool +get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3, + aco_opcode* med3, bool* some_gfx9_only) { switch (op) { - #define MINMAX(type, gfx9) \ - case aco_opcode::v_min_##type:\ - case aco_opcode::v_max_##type:\ - case aco_opcode::v_med3_##type:\ - *min = aco_opcode::v_min_##type;\ - *max = aco_opcode::v_max_##type;\ - *med3 = aco_opcode::v_med3_##type;\ - *min3 = aco_opcode::v_min3_##type;\ - *max3 = aco_opcode::v_max3_##type;\ - *some_gfx9_only = gfx9;\ +#define MINMAX(type, gfx9) \ + case aco_opcode::v_min_##type: \ + case aco_opcode::v_max_##type: \ + case aco_opcode::v_med3_##type: \ + *min = aco_opcode::v_min_##type; \ + *max = aco_opcode::v_max_##type; \ + *med3 = aco_opcode::v_med3_##type; \ + *min3 = aco_opcode::v_min3_##type; \ + *max3 = aco_opcode::v_max3_##type; \ + *some_gfx9_only = gfx9; \ return true; - MINMAX(f32, false) - MINMAX(u32, false) - MINMAX(i32, false) - MINMAX(f16, true) - MINMAX(u16, true) - MINMAX(i16, true) - #undef MINMAX - default: - return false; + MINMAX(f32, false) + MINMAX(u32, false) + MINMAX(i32, false) + MINMAX(f16, true) + MINMAX(u16, true) + MINMAX(i16, true) +#undef MINMAX + default: return false; } } -/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb - * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */ -bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, - aco_opcode min, aco_opcode max, aco_opcode med) +/* when ub > lb: + * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) + * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) + */ +bool +combine_clamp(opt_ctx& ctx, 
aco_ptr& instr, aco_opcode min, aco_opcode max, + aco_opcode med) { /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if @@ -2751,9 +2701,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, Operand operands[3]; bool neg[3], abs[3], clamp, precise; uint8_t opsel = 0, omod = 0; - if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, - "012", operands, neg, abs, &opsel, - &clamp, &omod, NULL, NULL, NULL, &precise)) { + if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg, + abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) { /* max(min(src, upper), lower) returns upper if src is NaN, but * med3(src, lower, upper) returns lower. */ @@ -2766,7 +2715,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, uint32_t val; if (operands[i].isConstant()) { val = operands[i].constantValue(); - } else if (operands[i].isTemp() && ctx.info[operands[i].tempId()].is_constant_or_literal(32)) { + } else if (operands[i].isTemp() && + ctx.info[operands[i].tempId()].is_constant_or_literal(32)) { val = ctx.info[operands[i].tempId()].val; } else { continue; @@ -2799,10 +2749,14 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, const0_f = _mesa_half_to_float(const0); const1_f = _mesa_half_to_float(const1); } - if (abs[const0_idx]) const0_f = fabsf(const0_f); - if (abs[const1_idx]) const1_f = fabsf(const1_f); - if (neg[const0_idx]) const0_f = -const0_f; - if (neg[const1_idx]) const1_f = -const1_f; + if (abs[const0_idx]) + const0_f = fabsf(const0_f); + if (abs[const1_idx]) + const1_f = fabsf(const1_f); + if (neg[const0_idx]) + const0_f = -const0_f; + if (neg[const1_idx]) + const1_f = -const1_f; lower_idx = const0_f < const1_f ? const0_idx : const1_idx; break; } @@ -2815,8 +2769,10 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, break; } case aco_opcode::v_min_i32: { - int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; - int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; + int32_t const0_i = + const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; + int32_t const1_i = + const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; lower_idx = const0_i < const1_i ? const0_idx : const1_idx; break; } @@ -2826,8 +2782,7 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, lower_idx = const0_i < const1_i ? const0_idx : const1_idx; break; } - default: - break; + default: break; } int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx; @@ -2849,8 +2804,8 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, return false; } - -void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) +void +apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || instr->opcode == aco_opcode::v_lshrrev_b64 || @@ -2904,8 +2859,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) /* Applying two sgprs require making it VOP3, so don't do it unless it's * definitively beneficial. * TODO: this is too conservative because later the use count could be reduced to 1 */ - if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && - !instr->isVOP3() && !instr->isSDWA() && instr->format != Format::VOP3P) + if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() && + !instr->isSDWA() && instr->format != Format::VOP3P) break; Temp sgpr = info.is_extract() ? 
info.instr->operands[0].getTemp() : info.temp; @@ -2913,7 +2868,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) if (new_sgpr && num_sgprs >= max_sgprs) continue; - if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || info.is_extract()) { + if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || + info.is_extract()) { /* can_apply_extract() checks SGPR encoding restrictions */ if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info)) apply_extract(ctx, instr, sgpr_idx, info); @@ -2946,7 +2902,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) } template -bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info) +bool +apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info) { if (!def_info.is_clamp() && (instr->clamp || instr->omod)) return false; @@ -2964,7 +2921,8 @@ bool apply_omod_clamp_helper(opt_ctx &ctx, T *instr, ssa_info& def_info) } /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ -bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) +bool +apply_omod_clamp(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 || !instr_info.can_use_output_modifiers[(int)instr->opcode]) @@ -2977,8 +2935,8 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) /* omod flushes -0 to +0 and has no effect if denormals are enabled */ bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */ if (instr->definitions[0].bytes() == 4) - can_use_omod = can_use_omod && ctx.fp_mode.denorm32 == 0 && - !ctx.fp_mode.preserve_signed_zero_inf_nan32; + can_use_omod = + can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32; else can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan16_64; @@ -3015,7 +2973,8 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) /* Combine an p_insert (or p_extract, in some cases) instruction with instr. * p_insert(instr(...)) -> instr_insert(). 
*/ -bool apply_insert(opt_ctx &ctx, aco_ptr& instr) +bool +apply_insert(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1) return false; @@ -3057,25 +3016,27 @@ bool apply_insert(opt_ctx &ctx, aco_ptr& instr) } /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */ -bool combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) +bool +combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i], true); - if (op_instr && - op_instr->opcode == aco_opcode::v_subbrev_co_u32 && - op_instr->operands[0].constantEquals(0) && - op_instr->operands[1].constantEquals(0) && + Instruction* op_instr = follow_operand(ctx, instr->operands[i], true); + if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 && + op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) && !op_instr->usesModifiers()) { aco_ptr new_instr; - if (instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { - new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)); + if (instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { + new_instr.reset( + create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)); } else if (ctx.program->chip_class >= GFX10 || (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) { - new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, asVOP3(Format::VOP2), 3, 1)); + new_instr.reset(create_instruction(aco_opcode::v_cndmask_b32, + asVOP3(Format::VOP2), 3, 1)); } else { return false; } @@ -3099,13 +3060,14 @@ bool combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1< v_mad_u32_u24(b, 1<& instr) +bool +combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; for (unsigned i = 0; i < 2; i++) { - Instruction *op_instr = follow_operand(ctx, instr->operands[i]); + Instruction* op_instr = follow_operand(ctx, instr->operands[i]); if (!op_instr) continue; @@ -3113,10 +3075,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) op_instr->opcode != aco_opcode::v_lshlrev_b32) continue; - if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && - op_instr->operands[1].isTemp() && - op_instr->operands[1].getTemp().type() == RegType::sgpr && - instr->operands[!i].isTemp() && + if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && op_instr->operands[1].isTemp() && + op_instr->operands[1].getTemp().type() == RegType::sgpr && instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::sgpr) return false; @@ -3129,7 +3089,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) ctx.uses[instr->operands[i].tempId()]--; - aco_ptr new_instr{create_instruction(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)}; new_instr->operands[0] = op_instr->operands[!shift_op_idx]; new_instr->operands[1] = Operand(multiplier); new_instr->operands[2] = instr->operands[!i]; @@ -3143,7 +3104,8 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr& instr) return false; } -void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi) +void +propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi) { /* propagate swizzles which apply to a result down to the instruction's 
operands: * result = a.xy + b.xx -> result.yx = a.yx + b.xx */ @@ -3151,8 +3113,8 @@ void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opse assert((opsel_hi & 1) == opsel_hi); uint8_t tmp_lo = instr->opsel_lo; uint8_t tmp_hi = instr->opsel_hi; - bool neg_lo[3] = { instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2] }; - bool neg_hi[3] = { instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2] }; + bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]}; + bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]}; if (opsel_lo == 1) { instr->opsel_lo = tmp_hi; for (unsigned i = 0; i < 3; i++) @@ -3165,16 +3127,14 @@ void propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opse } } -void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) +void +combine_vop3p(opt_ctx& ctx, aco_ptr& instr) { VOP3P_instruction* vop3p = &instr->vop3p(); /* apply clamp */ - if (instr->opcode == aco_opcode::v_pk_mul_f16 && - instr->operands[1].constantEquals(0x3C00) && - vop3p->clamp && - instr->operands[0].isTemp() && - ctx.uses[instr->operands[0].tempId()] == 1) { + if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) && + vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1) { ssa_info& info = ctx.info[instr->operands[0].tempId()]; if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) { @@ -3240,12 +3200,12 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p()) continue; ssa_info& info = ctx.info[instr->operands[i].tempId()]; - if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || info.instr->definitions[0].isPrecise()) + if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || + info.instr->definitions[0].isPrecise()) continue; Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; - if (ctx.uses[instr->operands[i].tempId()] >= uses || - !check_vop3_operands(ctx, 3, op)) + if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op)) continue; /* no clamp allowed between mul and add */ @@ -3274,7 +3234,8 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) /* turn packed mul+add into v_pk_fma_f16 */ assert(mul_instr->isVOP3P()); - aco_ptr fma{create_instruction(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)}; + aco_ptr fma{ + create_instruction(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)}; VOP3P_instruction* mul = &mul_instr->vop3p(); for (unsigned i = 0; i < 2; i++) { fma->operands[i] = op[i]; @@ -3302,7 +3263,8 @@ void combine_vop3p(opt_ctx &ctx, aco_ptr& instr) // TODO: we could possibly move the whole label_instruction pass to combine_instruction: // this would mean that we'd have to fix the instruction uses while value propagation -void combine_instruction(opt_ctx &ctx, aco_ptr& instr) +void +combine_instruction(opt_ctx& ctx, aco_ptr& instr) { if (instr->definitions.empty() || is_dead(ctx.uses, instr.get())) return; @@ -3315,8 +3277,9 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (!op.isTemp()) continue; ssa_info& info = ctx.info[op.tempId()]; - if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr || - instr->operands[i].getTemp().type() == RegType::sgpr) && + if (info.is_extract() && + (info.instr->operands[0].getTemp().type() == RegType::vgpr || + instr->operands[i].getTemp().type() == RegType::sgpr) && can_apply_extract(ctx, instr, i, info)) { 
apply_extract(ctx, instr, i, info); ctx.uses[instr->operands[i].tempId()]--; @@ -3326,7 +3289,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (can_apply_sgprs(ctx, instr)) apply_sgprs(ctx, instr); - while (apply_omod_clamp(ctx, instr)) ; + while (apply_omod_clamp(ctx, instr)) + ; apply_insert(ctx, instr); } @@ -3351,7 +3315,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) * floats. */ /* neg(mul(a, b)) -> mul(neg(a), b) */ - if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) { + if (ctx.info[instr->definitions[0].tempId()].is_neg() && + ctx.uses[instr->operands[1].tempId()] == 1) { Temp val = ctx.info[instr->definitions[0].tempId()].temp; if (!ctx.info[val.id()].is_mul()) @@ -3371,7 +3336,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) Definition def = instr->definitions[0]; /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */ bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); - instr.reset(create_instruction(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); + instr.reset( + create_instruction(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1)); instr->operands[0] = mul_instr->operands[0]; instr->operands[1] = mul_instr->operands[1]; instr->definitions[0] = def; @@ -3392,15 +3358,13 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } /* combine mul+add -> mad */ - bool mad32 = instr->opcode == aco_opcode::v_add_f32 || - instr->opcode == aco_opcode::v_sub_f32 || + bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32; - bool mad16 = instr->opcode == aco_opcode::v_add_f16 || - instr->opcode == aco_opcode::v_sub_f16 || + bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 || instr->opcode == aco_opcode::v_subrev_f16; if (mad16 || mad32) { - bool need_fma = mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) : - (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); + bool need_fma = mad32 ? 
(ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) + : (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); if (need_fma && instr->definitions[0].isPrecise()) return; if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32) @@ -3423,8 +3387,7 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) continue; Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; - if (info.instr->isSDWA() || - !check_vop3_operands(ctx, 3, op) || + if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) || ctx.uses[instr->operands[i].tempId()] >= uses) continue; @@ -3435,7 +3398,8 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (mul_instr) { /* turn mul+add into v_mad/v_fma */ - Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]}; + Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], + instr->operands[add_op_idx]}; ctx.uses[mul_instr->definitions[0].tempId()]--; if (ctx.uses[mul_instr->definitions[0].tempId()]) { if (op[0].isTemp()) @@ -3475,15 +3439,19 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16) neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; - else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16) + else if (instr->opcode == aco_opcode::v_subrev_f32 || + instr->opcode == aco_opcode::v_subrev_f16) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; if (mad16) - mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) : - (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); + mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 + : aco_opcode::v_fma_f16) + : (ctx.program->chip_class == GFX8 ? 
aco_opcode::v_mad_legacy_f16 + : aco_opcode::v_mad_f16); - aco_ptr mad{create_instruction(mad_op, Format::VOP3, 3, 1)}; + aco_ptr mad{ + create_instruction(mad_op, Format::VOP3, 3, 1)}; for (unsigned i = 0; i < 3; i++) { mad->operands[i] = op[i]; mad->neg[i] = neg[i]; @@ -3504,12 +3472,13 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) { for (unsigned i = 0; i < 2; i++) { if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && - ctx.uses[instr->operands[i].tempId()] == 1 && - instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { + ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() && + instr->operands[!i].getTemp().type() == RegType::vgpr) { ctx.uses[instr->operands[i].tempId()]--; ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++; - aco_ptr new_instr{create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; + aco_ptr new_instr{ + create_instruction(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; new_instr->operands[0] = Operand(0u); new_instr->operands[1] = instr->operands[!i]; new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); @@ -3520,34 +3489,49 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) } } } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) { - if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; - else combine_add_or_then_and_lshl(ctx, instr) ; + if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, + "012", 1 | 2)) { + } else if (combine_add_or_then_and_lshl(ctx, instr)) { + } } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) { - if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) ; - else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2); + if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, + "012", 1 | 2)) { + } } else if (instr->opcode == aco_opcode::v_add_u32) { - if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else if (combine_add_bcnt(ctx, instr)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; - else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { - if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; - else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, aco_opcode::v_mad_u32_u16, "120", 1 | 2)) ; - else combine_add_or_then_and_lshl(ctx, instr) ; + if 
(combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { + } else if (combine_add_bcnt(ctx, instr)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, + aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { + } else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { + if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", + 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, + "120", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, + "012", 1 | 2)) { + } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, + aco_opcode::v_mad_u32_u16, "120", 1 | 2)) { + } else if (combine_add_or_then_and_lshl(ctx, instr)) { + } } } else if (instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) { bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0; - if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else if (!carry_out && combine_add_bcnt(ctx, instr)) ; - else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; - else if (!carry_out) combine_add_lshl(ctx, instr); - } else if (instr->opcode == aco_opcode::v_sub_u32 || - instr->opcode == aco_opcode::v_sub_co_u32 || + if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { + } else if (!carry_out && combine_add_bcnt(ctx, instr)) { + } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, + aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { + } else if (!carry_out && combine_add_lshl(ctx, instr)) { + } + } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32_e64) { combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2); } else if (instr->opcode == aco_opcode::v_subrev_u32 || @@ -3555,17 +3539,20 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) instr->opcode == aco_opcode::v_subrev_co_u32_e64) { combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1); } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) { - combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2); - } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) { + combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", + 2); + } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && + ctx.program->chip_class >= GFX9) { combine_salu_lshl_add(ctx, instr); } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) { combine_salu_not_bitwise(ctx, instr); } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 || instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) { - if (combine_ordering_test(ctx, instr)) ; - else if (combine_comparison_ordering(ctx, instr)) ; - else if (combine_constant_comparison_ordering(ctx, instr)) ; - else combine_salu_n2(ctx, instr); + if 
(combine_ordering_test(ctx, instr)) { + } else if (combine_comparison_ordering(ctx, instr)) { + } else if (combine_constant_comparison_ordering(ctx, instr)) { + } else if (combine_salu_n2(ctx, instr)) { + } } else if (instr->opcode == aco_opcode::v_and_b32) { combine_and_subbrev(ctx, instr); } else { @@ -3573,8 +3560,11 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) bool some_gfx9_only; if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && (!some_gfx9_only || ctx.program->chip_class >= GFX9)) { - if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, instr->opcode == min ? min3 : max3)) ; - else combine_clamp(ctx, instr, min, max, med3); + if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, + instr->opcode == min ? min3 : max3)) { + } else { + combine_clamp(ctx, instr, min, max, med3); + } } } @@ -3583,27 +3573,22 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) combine_inverse_comparison(ctx, instr); } -bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) +bool +to_uniform_bool_instr(opt_ctx& ctx, aco_ptr& instr) { switch (instr->opcode) { - case aco_opcode::s_and_b32: - case aco_opcode::s_and_b64: - instr->opcode = aco_opcode::s_and_b32; - break; - case aco_opcode::s_or_b32: - case aco_opcode::s_or_b64: - instr->opcode = aco_opcode::s_or_b32; - break; - case aco_opcode::s_xor_b32: - case aco_opcode::s_xor_b64: - instr->opcode = aco_opcode::s_absdiff_i32; - break; - default: - /* Don't transform other instructions. They are very unlikely to appear here. */ - return false; + case aco_opcode::s_and_b32: + case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break; + case aco_opcode::s_or_b32: + case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break; + case aco_opcode::s_xor_b32: + case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break; + default: + /* Don't transform other instructions. They are very unlikely to appear here. */ + return false; } - for (Operand &op : instr->operands) { + for (Operand& op : instr->operands) { ctx.uses[op.tempId()]--; if (ctx.info[op.tempId()].is_uniform_bool()) { @@ -3611,12 +3596,14 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) op.setTemp(ctx.info[op.tempId()].temp); } else if (ctx.info[op.tempId()].is_uniform_bitwise()) { /* Use the SCC definition of the predecessor instruction. - * This allows the predecessor to get picked up by the same optimization (if it has no divergent users), - * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed. + * This allows the predecessor to get picked up by the same optimization (if it has no + * divergent users), and it also makes sure that the current instruction will keep working + * even if the predecessor won't be transformed. 
*/ - Instruction *pred_instr = ctx.info[op.tempId()].instr; + Instruction* pred_instr = ctx.info[op.tempId()].instr; assert(pred_instr->definitions.size() >= 2); - assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc); + assert(pred_instr->definitions[1].isFixed() && + pred_instr->definitions[1].physReg() == scc); op.setTemp(pred_instr->definitions[1].getTemp()); } else { unreachable("Invalid operand on uniform bitwise instruction."); @@ -3631,7 +3618,8 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr &instr) return true; } -void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) +void +select_mul_u32_u24(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return; @@ -3655,12 +3643,12 @@ void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) /* VOP2 instructions can only take constants/sgprs in operand 0. */ if ((instr->operands[1].isConstant() || - (instr->operands[1].hasRegClass() && - instr->operands[1].regClass().type() == RegType::sgpr))) { + (instr->operands[1].hasRegClass() && + instr->operands[1].regClass().type() == RegType::sgpr))) { swap = true; if ((instr->operands[0].isConstant() || - (instr->operands[0].hasRegClass() && - instr->operands[0].regClass().type() == RegType::sgpr))) { + (instr->operands[0].hasRegClass() && + instr->operands[0].regClass().type() == RegType::sgpr))) { /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because * v_mul_u32_u24 has no advantages. */ @@ -3668,14 +3656,16 @@ void select_mul_u32_u24(opt_ctx &ctx, aco_ptr& instr) } } - VOP2_instruction *new_instr = create_instruction(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1); + VOP2_instruction* new_instr = + create_instruction(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1); new_instr->operands[0] = instr->operands[swap]; new_instr->operands[1] = instr->operands[!swap]; new_instr->definitions[0] = instr->definitions[0]; instr.reset(new_instr); } -void select_instruction(opt_ctx &ctx, aco_ptr& instr) +void +select_instruction(opt_ctx& ctx, aco_ptr& instr) { const uint32_t threshold = 4; @@ -3689,7 +3679,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) unsigned num_used = 0; unsigned idx = 0; unsigned split_offset = 0; - for (unsigned i = 0, offset = 0; i < instr->definitions.size(); offset += instr->definitions[i++].bytes()) { + for (unsigned i = 0, offset = 0; i < instr->definitions.size(); + offset += instr->definitions[i++].bytes()) { if (ctx.uses[instr->definitions[i].tempId()]) { num_used++; idx = i; @@ -3699,7 +3690,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) bool done = false; if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() && ctx.uses[instr->operands[0].tempId()] == 1) { - Instruction *vec = ctx.info[instr->operands[0].tempId()].instr; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; unsigned off = 0; Operand op; @@ -3719,7 +3710,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (op.isTemp()) ctx.uses[op.tempId()]++; - aco_ptr extract{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; + aco_ptr extract{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; extract->operands[0] = op; extract->definitions[0] = instr->definitions[idx]; instr.reset(extract.release()); @@ -3731,9 +3723,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (!done && num_used == 1 && instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 && split_offset % instr->definitions[idx].bytes() == 0) { - aco_ptr 
extract{create_instruction(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; + aco_ptr extract{create_instruction( + aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; extract->operands[0] = instr->operands[0]; - extract->operands[1] = Operand((uint32_t) split_offset / instr->definitions[idx].bytes()); + extract->operands[1] = Operand((uint32_t)split_offset / instr->definitions[idx].bytes()); extract->definitions[0] = instr->definitions[idx]; instr.reset(extract.release()); } @@ -3762,8 +3755,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) bool sgpr_used = false; uint32_t literal_idx = 0; uint32_t literal_uses = UINT32_MAX; - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isConstant() && i > 0) { literal_uses = UINT32_MAX; break; @@ -3771,8 +3763,10 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (!instr->operands[i].isTemp()) continue; unsigned bits = get_operand_size(instr, i); - /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 or operands other than the 1st */ - if (instr->operands[i].getTemp().type() == RegType::sgpr && (i > 0 || ctx.program->chip_class < GFX10)) { + /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10 + * or operands other than the 1st */ + if (instr->operands[i].getTemp().type() == RegType::sgpr && + (i > 0 || ctx.program->chip_class < GFX10)) { if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; @@ -3781,8 +3775,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } sgpr_used = true; /* don't break because we still need to check constants */ - } else if (!sgpr_used && - ctx.info[instr->operands[i].tempId()].is_literal(bits) && + } else if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits) && ctx.uses[instr->operands[i].tempId()] < literal_uses) { literal_uses = ctx.uses[instr->operands[i].tempId()]; literal_idx = i; @@ -3805,20 +3798,17 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } } - /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */ - if (instr->isBranch() && - instr->operands.size() && - instr->operands[0].isTemp() && - instr->operands[0].isFixed() && - instr->operands[0].physReg() == scc) { + /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions + * when it isn't beneficial */ + if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() && + instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) { ctx.info[instr->operands[0].tempId()].set_scc_needed(); return; } else if ((instr->opcode == aco_opcode::s_cselect_b64 || instr->opcode == aco_opcode::s_cselect_b32) && instr->operands[2].isTemp()) { ctx.info[instr->operands[2].tempId()].set_scc_needed(); - } else if (instr->opcode == aco_opcode::p_wqm && - instr->operands[0].isTemp() && + } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_scc_needed()) { /* Propagate label so it is correctly detected by the uniform bool transform */ ctx.info[instr->operands[0].tempId()].set_scc_needed(); @@ -3832,13 +3822,13 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) return; /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. 
*/ - if (instr->definitions.size() && - ctx.uses[instr->definitions[0].tempId()] == 0 && + if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 && ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) { bool transform_done = to_uniform_bool_instr(ctx, instr); if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) { - /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */ + /* Swap the two definition IDs in order to avoid overusing the SCC. + * This reduces extra moves generated by RA. */ uint32_t def0_id = instr->definitions[0].getTemp().id(); uint32_t def1_id = instr->definitions[1].getTemp().id(); instr->definitions[0].setTemp(Temp(def1_id, s1)); @@ -3851,8 +3841,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (instr->opcode == aco_opcode::v_mad_u32_u16) select_mul_u32_u24(ctx, instr); - if (instr->isSDWA() || instr->isDPP() || - (instr->isVOP3() && ctx.program->chip_class < GFX10) || + if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) || (instr->isVOP3P() && ctx.program->chip_class < GFX10)) return; /* some encodings can't ever take literals */ @@ -3864,8 +3853,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) Operand literal(s1); unsigned num_operands = 1; if (instr->isSALU() || - (ctx.program->chip_class >= GFX10 && - (can_use_VOP3(ctx, instr) || instr->isVOP3P()))) + (ctx.program->chip_class >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P()))) num_operands = instr->operands.size(); /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */ else if (instr->isVALU() && instr->operands.size() >= 3) @@ -3905,7 +3893,6 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) mask |= (op.tempId() == literal_id) << i; } - /* don't go over the constant bus limit */ bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || instr->opcode == aco_opcode::v_lshrrev_b64 || @@ -3931,8 +3918,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) } } - -void apply_literals(opt_ctx &ctx, aco_ptr& instr) +void +apply_literals(opt_ctx& ctx, aco_ptr& instr) { /* Cleanup Dead Instructions */ if (!instr) @@ -3945,10 +3932,12 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) { aco_ptr new_mad; - aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; + aco_opcode new_op = + info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; if (instr->opcode == aco_opcode::v_fma_f32) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32; - else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16) + else if (instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16) new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16; else if (instr->opcode == aco_opcode::v_fma_f16) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16; @@ -3985,8 +3974,8 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) ctx.instructions.emplace_back(std::move(instr)); } - -void optimize(Program* program) +void +optimize(Program* program) { opt_ctx ctx; ctx.program = program; @@ -4010,10 +3999,12 @@ void optimize(Program* program) } /* 3. 
Top-Down DAG pass (backward) to select instructions (includes DCE) */ - for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); ++block_rit) { + for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend(); + ++block_rit) { Block* block = &(*block_rit); ctx.fp_mode = block->fp_mode; - for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend(); ++instr_rit) + for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend(); + ++instr_rit) select_instruction(ctx, *instr_rit); } @@ -4025,7 +4016,6 @@ void optimize(Program* program) apply_literals(ctx, instr); block.instructions.swap(ctx.instructions); } - } -} +} // namespace aco diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 590b9e3f1e9..2e426cf81a3 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -24,9 +24,9 @@ #include "aco_ir.h" -#include #include #include +#include #include namespace aco { @@ -41,15 +41,14 @@ enum { written_by_multiple_instrs = -4, }; -struct pr_opt_ctx -{ - Program *program; - Block *current_block; +struct pr_opt_ctx { + Program* program; + Block* current_block; int current_instr_idx; std::vector uses; std::array instr_idx_by_regs; - void reset_block(Block *block) + void reset_block(Block* block) { current_block = block; current_instr_idx = -1; @@ -57,9 +56,10 @@ struct pr_opt_ctx } }; -void save_reg_writes(pr_opt_ctx &ctx, aco_ptr &instr) +void +save_reg_writes(pr_opt_ctx& ctx, aco_ptr& instr) { - for (const Definition &def : instr->definitions) { + for (const Definition& def : instr->definitions) { assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255); assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256); @@ -75,20 +75,21 @@ void save_reg_writes(pr_opt_ctx &ctx, aco_ptr &instr) } } -int last_writer_idx(pr_opt_ctx &ctx, PhysReg physReg, RegClass rc) +int +last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc) { /* Verify that all of the operand's registers are written by the same instruction. */ int instr_idx = ctx.instr_idx_by_regs[physReg.reg()]; unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u); unsigned r = physReg.reg(); - bool all_same = std::all_of( - &ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], - [instr_idx](int i) { return i == instr_idx; }); + bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], + [instr_idx](int i) { return i == instr_idx; }); return all_same ? 
instr_idx : written_by_multiple_instrs; } -int last_writer_idx(pr_opt_ctx &ctx, const Operand &op) +int +last_writer_idx(pr_opt_ctx& ctx, const Operand& op) { if (op.isConstant() || op.isUndefined()) return const_or_undef; @@ -104,7 +105,8 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op) return instr_idx; } -void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) +void +try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr& instr) { /* We are looking for the following pattern: * @@ -123,8 +125,7 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) if (ctx.program->chip_class < GFX8) return; - if (instr->format != Format::PSEUDO_BRANCH || - instr->operands.size() == 0 || + if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 || instr->operands[0].physReg() != scc) return; @@ -141,13 +142,12 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block) return; - aco_ptr &op0_instr = ctx.current_block->instructions[op0_instr_idx]; - aco_ptr &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; + aco_ptr& op0_instr = ctx.current_block->instructions[op0_instr_idx]; + aco_ptr& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || - op0_instr->operands[0].physReg() != vcc || - op0_instr->operands[1].physReg() != exec || + op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec || !last_vcc_wr->isVOPC()) return; @@ -159,7 +159,8 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr &instr) instr->operands[0] = op0_instr->operands[0]; } -void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) +void +try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) { /* We are looking for the following pattern: * @@ -180,8 +181,7 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (instr->isSOPC() && (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 || - instr->opcode == aco_opcode::s_cmp_eq_u64 || - instr->opcode == aco_opcode::s_cmp_lg_u64) && + instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) && (instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) && (instr->operands[0].isTemp() || instr->operands[1].isTemp())) { /* Make sure the constant is always in operand 1 */ @@ -197,8 +197,9 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_idx < 0 || wr_idx != sccwr_idx) return; - aco_ptr &wr_instr = ctx.current_block->instructions[wr_idx]; - if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc) + aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || + wr_instr->definitions[1].physReg() != scc) return; /* Look for instructions which set SCC := (D != 0) */ @@ -232,10 +233,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) case aco_opcode::s_ashr_i32: case aco_opcode::s_ashr_i64: case aco_opcode::s_abs_i32: - case aco_opcode::s_absdiff_i32: - break; - default: - return; + case aco_opcode::s_absdiff_i32: break; + default: return; } /* Use the SCC def from wr_instr */ @@ -245,13 +244,12 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) 
/* Set the opcode and operand to 32-bit */ instr->operands[1] = Operand(0u); - instr->opcode = (instr->opcode == aco_opcode::s_cmp_eq_u32 || - instr->opcode == aco_opcode::s_cmp_eq_i32 || - instr->opcode == aco_opcode::s_cmp_eq_u64) - ? aco_opcode::s_cmp_eq_u32 - : aco_opcode::s_cmp_lg_u32; - } else if ((instr->format == Format::PSEUDO_BRANCH && - instr->operands.size() == 1 && + instr->opcode = + (instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 || + instr->opcode == aco_opcode::s_cmp_eq_u64) + ? aco_opcode::s_cmp_eq_u32 + : aco_opcode::s_cmp_lg_u32; + } else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 && instr->operands[0].physReg() == scc) || instr->opcode == aco_opcode::s_cselect_b32) { @@ -265,10 +263,11 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_idx < 0) return; - aco_ptr &wr_instr = ctx.current_block->instructions[wr_idx]; + aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; /* Check if we found the pattern above. */ - if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && wr_instr->opcode != aco_opcode::s_cmp_lg_u32) + if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && + wr_instr->opcode != aco_opcode::s_cmp_lg_u32) return; if (wr_instr->operands[0].physReg() != scc) return; @@ -282,11 +281,13 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) { /* Flip the meaning of the instruction to correctly use the SCC. */ if (instr->format == Format::PSEUDO_BRANCH) - instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z; + instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz + : aco_opcode::p_cbranch_z; else if (instr->opcode == aco_opcode::s_cselect_b32) std::swap(instr->operands[0], instr->operands[1]); else - unreachable("scc_nocompare optimization is only implemented for p_cbranch and s_cselect"); + unreachable( + "scc_nocompare optimization is only implemented for p_cbranch and s_cselect"); } /* Use the SCC def from the original instruction, not the comparison */ @@ -295,7 +296,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr &instr) } } -void process_instruction(pr_opt_ctx &ctx, aco_ptr &instr) +void +process_instruction(pr_opt_ctx& ctx, aco_ptr& instr) { ctx.current_instr_idx++; @@ -307,9 +309,10 @@ void process_instruction(pr_opt_ctx &ctx, aco_ptr &instr) save_reg_writes(ctx, instr); } -} /* End of empty namespace */ +} // namespace -void optimize_postRA(Program* program) +void +optimize_postRA(Program* program) { pr_opt_ctx ctx; ctx.program = program; @@ -319,10 +322,10 @@ void optimize_postRA(Program* program) * Goes through each instruction exactly once, and can transform * instructions or adjust the use counts of temps. */ - for (auto &block : program->blocks) { + for (auto& block : program->blocks) { ctx.reset_block(&block); - for (aco_ptr &instr : block.instructions) + for (aco_ptr& instr : block.instructions) process_instruction(ctx, instr); } @@ -330,13 +333,12 @@ void optimize_postRA(Program* program) * Gets rid of instructions which are manually deleted or * no longer have any uses. 
*/ - for (auto &block : program->blocks) { - auto new_end = std::remove_if( - block.instructions.begin(), block.instructions.end(), - [&ctx](const aco_ptr &instr) { return !instr || is_dead(ctx.uses, instr.get()); }); + for (auto& block : program->blocks) { + auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(), + [&ctx](const aco_ptr& instr) + { return !instr || is_dead(ctx.uses, instr.get()); }); block.instructions.resize(new_end - block.instructions.begin()); } } -} /* End of aco namespace */ - +} // namespace aco diff --git a/src/amd/compiler/aco_print_asm.cpp b/src/amd/compiler/aco_print_asm.cpp index ec86327e212..dcc7c4bc747 100644 --- a/src/amd/compiler/aco_print_asm.cpp +++ b/src/amd/compiler/aco_print_asm.cpp @@ -39,17 +39,17 @@ namespace { /* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm * for GFX6-GFX7 if found on the system, this is better than nothing. -*/ -bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, - FILE *output) + */ +bool +print_asm_gfx6_gfx7(Program* program, std::vector& binary, FILE* output) { #ifdef _WIN32 return true; #else char path[] = "/tmp/fileXXXXXX"; char line[2048], command[128]; - const char *gpu_type; - FILE *p; + const char* gpu_type; + FILE* p; int fd; /* Dump the binary into a temporary file. */ @@ -57,8 +57,7 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, if (fd < 0) return true; - for (uint32_t w : binary) - { + for (uint32_t w : binary) { if (write(fd, &w, sizeof(w)) == -1) goto fail; } @@ -69,30 +68,16 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector& binary, switch (program->chip_class) { case GFX6: switch (program->family) { - case CHIP_TAHITI: - gpu_type = "tahiti"; - break; - case CHIP_PITCAIRN: - gpu_type = "pitcairn"; - break; - case CHIP_VERDE: - gpu_type = "capeverde"; - break; - case CHIP_OLAND: - gpu_type = "oland"; - break; - case CHIP_HAINAN: - gpu_type = "hainan"; - break; - default: - unreachable("Invalid GFX6 family!"); + case CHIP_TAHITI: gpu_type = "tahiti"; break; + case CHIP_PITCAIRN: gpu_type = "pitcairn"; break; + case CHIP_VERDE: gpu_type = "capeverde"; break; + case CHIP_OLAND: gpu_type = "oland"; break; + case CHIP_HAINAN: gpu_type = "hainan"; break; + default: unreachable("Invalid GFX6 family!"); } break; - case GFX7: - gpu_type = "gfx700"; - break; - default: - unreachable("Invalid chip class!"); + case GFX7: gpu_type = "gfx700"; break; + default: unreachable("Invalid chip class!"); } sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path); @@ -121,22 +106,21 @@ fail: #endif } -std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, - uint32_t *binary, unsigned exec_size, size_t pos, - char *outline, unsigned outline_size) +std::pair +disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size, + size_t pos, char* outline, unsigned outline_size) { /* mask out src2 on v_writelane_b32 */ if (((chip == GFX8 || chip == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) || (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) { - binary[pos+1] = binary[pos+1] & 0xF803FFFF; + binary[pos + 1] = binary[pos + 1] & 0xF803FFFF; } - size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos], - (exec_size - pos) * sizeof(uint32_t), pos * 4, - outline, outline_size); + size_t l = + LLVMDisasmInstruction(disasm, (uint8_t*)&binary[pos], (exec_size - pos) * sizeof(uint32_t), + pos * 4, outline, outline_size); - if (chip >= GFX10 && l == 8 && - ((binary[pos] & 
0xffff0000) == 0xd7610000) && + if (chip >= GFX10 && l == 8 && ((binary[pos] & 0xffff0000) == 0xd7610000) && ((binary[pos + 1] & 0x1ff) == 0xff)) { /* v_writelane with literal uses 3 dwords but llvm consumes only 2 */ l += 4; @@ -145,14 +129,14 @@ std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disas bool invalid = false; size_t size; if (!l && - ((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */ + ((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */ (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7038000) || /* v_add_u16_e64 + clamp */ - (chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */ + (chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */ (chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd76d8000) || /* v_add3_u32 + clamp */ (chip == GFX9 && (binary[pos] & 0xffff8000) == 0xd1ff8000)) /* v_add3_u32 + clamp */) { strcpy(outline, "\tinteger addition + clamp"); - bool has_literal = chip >= GFX10 && - (((binary[pos+1] & 0x1ff) == 0xff) || (((binary[pos+1] >> 9) & 0x1ff) == 0xff)); + bool has_literal = chip >= GFX10 && (((binary[pos + 1] & 0x1ff) == 0xff) || + (((binary[pos + 1] >> 9) & 0x1ff) == 0xff)); size = 2 + has_literal; } else if (chip >= GFX10 && l == 4 && ((binary[pos] & 0xfe0001ff) == 0x020000f9)) { strcpy(outline, "\tv_cndmask_b32 + sdwa"); @@ -170,8 +154,8 @@ std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disas } } /* end namespace */ -bool print_asm(Program *program, std::vector& binary, - unsigned exec_size, FILE *output) +bool +print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output) { if (program->chip_class <= GFX7) { /* Do not abort if clrxdisasm isn't found. 
*/ @@ -187,7 +171,7 @@ bool print_asm(Program *program, std::vector& binary, } std::vector symbols; - std::vector> block_names; + std::vector> block_names; block_names.reserve(program->blocks.size()); for (Block& block : program->blocks) { if (!referenced_blocks[block.index]) @@ -195,18 +179,18 @@ bool print_asm(Program *program, std::vector& binary, std::array name; sprintf(name.data(), "BB%u", block.index); block_names.push_back(name); - symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0); + symbols.emplace_back(block.offset * 4, + llvm::StringRef(block_names[block_names.size() - 1].data()), 0); } - const char *features = ""; + const char* features = ""; if (program->chip_class >= GFX10 && program->wave_size == 64) { features = "+wavefrontsize64"; } - LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", - ac_get_llvm_processor_name(program->family), - features, - &symbols, 0, NULL, NULL); + LLVMDisasmContextRef disasm = + LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", ac_get_llvm_processor_name(program->family), + features, &symbols, 0, NULL, NULL); size_t pos = 0; bool invalid = false; @@ -216,7 +200,8 @@ bool print_asm(Program *program, std::vector& binary, unsigned prev_pos = 0; unsigned repeat_count = 0; while (pos < exec_size) { - bool new_block = next_block < program->blocks.size() && pos == program->blocks[next_block].offset; + bool new_block = + next_block < program->blocks.size() && pos == program->blocks[next_block].offset; if (pos + prev_size <= exec_size && prev_pos != pos && !new_block && memcmp(&binary[prev_pos], &binary[pos], prev_size * 4) == 0) { repeat_count++; @@ -235,8 +220,8 @@ bool print_asm(Program *program, std::vector& binary, } char outline[1024]; - std::pair res = disasm_instr( - program->chip_class, disasm, binary.data(), exec_size, pos, outline, sizeof(outline)); + std::pair res = disasm_instr(program->chip_class, disasm, binary.data(), + exec_size, pos, outline, sizeof(outline)); invalid |= res.first; fprintf(output, "%-60s ;", outline); @@ -271,4 +256,4 @@ bool print_asm(Program *program, std::vector& binary, return invalid; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index c45e823ca65..339b938c3eb 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -86,36 +86,38 @@ const std::array reduce_ops = []() return ret; }(); -static void print_reg_class(const RegClass rc, FILE *output) +static void +print_reg_class(const RegClass rc, FILE* output) { switch (rc) { - case RegClass::s1: fprintf(output, " s1: "); return; - case RegClass::s2: fprintf(output, " s2: "); return; - case RegClass::s3: fprintf(output, " s3: "); return; - case RegClass::s4: fprintf(output, " s4: "); return; - case RegClass::s6: fprintf(output, " s6: "); return; - case RegClass::s8: fprintf(output, " s8: "); return; - case RegClass::s16: fprintf(output, "s16: "); return; - case RegClass::v1: fprintf(output, " v1: "); return; - case RegClass::v2: fprintf(output, " v2: "); return; - case RegClass::v3: fprintf(output, " v3: "); return; - case RegClass::v4: fprintf(output, " v4: "); return; - case RegClass::v5: fprintf(output, " v5: "); return; - case RegClass::v6: fprintf(output, " v6: "); return; - case RegClass::v7: fprintf(output, " v7: "); return; - case RegClass::v8: fprintf(output, " v8: "); return; - case RegClass::v1b: fprintf(output, " v1b: "); return; - case RegClass::v2b: fprintf(output, " v2b: "); return; 
- case RegClass::v3b: fprintf(output, " v3b: "); return; - case RegClass::v4b: fprintf(output, " v4b: "); return; - case RegClass::v6b: fprintf(output, " v6b: "); return; - case RegClass::v8b: fprintf(output, " v8b: "); return; - case RegClass::v1_linear: fprintf(output, " v1: "); return; - case RegClass::v2_linear: fprintf(output, " v2: "); return; + case RegClass::s1: fprintf(output, " s1: "); return; + case RegClass::s2: fprintf(output, " s2: "); return; + case RegClass::s3: fprintf(output, " s3: "); return; + case RegClass::s4: fprintf(output, " s4: "); return; + case RegClass::s6: fprintf(output, " s6: "); return; + case RegClass::s8: fprintf(output, " s8: "); return; + case RegClass::s16: fprintf(output, "s16: "); return; + case RegClass::v1: fprintf(output, " v1: "); return; + case RegClass::v2: fprintf(output, " v2: "); return; + case RegClass::v3: fprintf(output, " v3: "); return; + case RegClass::v4: fprintf(output, " v4: "); return; + case RegClass::v5: fprintf(output, " v5: "); return; + case RegClass::v6: fprintf(output, " v6: "); return; + case RegClass::v7: fprintf(output, " v7: "); return; + case RegClass::v8: fprintf(output, " v8: "); return; + case RegClass::v1b: fprintf(output, " v1b: "); return; + case RegClass::v2b: fprintf(output, " v2b: "); return; + case RegClass::v3b: fprintf(output, " v3b: "); return; + case RegClass::v4b: fprintf(output, " v4b: "); return; + case RegClass::v6b: fprintf(output, " v6b: "); return; + case RegClass::v8b: fprintf(output, " v8b: "); return; + case RegClass::v1_linear: fprintf(output, " v1: "); return; + case RegClass::v2_linear: fprintf(output, " v2: "); return; } } -void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags) +void +print_physReg(PhysReg reg, unsigned bytes, FILE* output, unsigned flags) { if (reg == 124) { fprintf(output, "m0"); @@ -134,16 +136,17 @@ void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags) } else { fprintf(output, "%c[%d", is_vgpr ? 
'v' : 's', r); if (size > 1) - fprintf(output, "-%d]", r + size -1); + fprintf(output, "-%d]", r + size - 1); else fprintf(output, "]"); } if (reg.byte() || bytes % 4) - fprintf(output, "[%d:%d]", reg.byte()*8, (reg.byte()+bytes) * 8); + fprintf(output, "[%d:%d]", reg.byte() * 8, (reg.byte() + bytes) * 8); } } -static void print_constant(uint8_t reg, FILE *output) +static void +print_constant(uint8_t reg, FILE* output) { if (reg >= 128 && reg <= 192) { fprintf(output, "%d", reg - 128); @@ -154,37 +157,20 @@ static void print_constant(uint8_t reg, FILE *output) } switch (reg) { - case 240: - fprintf(output, "0.5"); - break; - case 241: - fprintf(output, "-0.5"); - break; - case 242: - fprintf(output, "1.0"); - break; - case 243: - fprintf(output, "-1.0"); - break; - case 244: - fprintf(output, "2.0"); - break; - case 245: - fprintf(output, "-2.0"); - break; - case 246: - fprintf(output, "4.0"); - break; - case 247: - fprintf(output, "-4.0"); - break; - case 248: - fprintf(output, "1/(2*PI)"); - break; + case 240: fprintf(output, "0.5"); break; + case 241: fprintf(output, "-0.5"); break; + case 242: fprintf(output, "1.0"); break; + case 243: fprintf(output, "-1.0"); break; + case 244: fprintf(output, "2.0"); break; + case 245: fprintf(output, "-2.0"); break; + case 246: fprintf(output, "4.0"); break; + case 247: fprintf(output, "-4.0"); break; + case 248: fprintf(output, "1/(2*PI)"); break; } } -void aco_print_operand(const Operand *operand, FILE *output, unsigned flags) +void +aco_print_operand(const Operand* operand, FILE* output, unsigned flags) { if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) { if (operand->bytes() == 1) @@ -216,7 +202,8 @@ void aco_print_operand(const Operand *operand, FILE *output, unsigned flags) } } -static void print_definition(const Definition *definition, FILE *output, unsigned flags) +static void +print_definition(const Definition* definition, FILE* output, unsigned flags) { if (!(flags & print_no_ssa)) print_reg_class(definition->regClass(), output); @@ -235,7 +222,8 @@ static void print_definition(const Definition *definition, FILE *output, unsigne print_physReg(definition->physReg(), definition->bytes(), output, flags); } -static void print_storage(storage_class storage, FILE *output) +static void +print_storage(storage_class storage, FILE* output) { fprintf(output, " storage:"); int printed = 0; @@ -255,7 +243,8 @@ static void print_storage(storage_class storage, FILE *output) printed += fprintf(output, "%svgpr_spill", printed ? "," : ""); } -static void print_semantics(memory_semantics sem, FILE *output) +static void +print_semantics(memory_semantics sem, FILE* output) { fprintf(output, " semantics:"); int printed = 0; @@ -275,36 +264,29 @@ static void print_semantics(memory_semantics sem, FILE *output) printed += fprintf(output, "%srmw", printed ? 
"," : ""); } -static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope") +static void +print_scope(sync_scope scope, FILE* output, const char* prefix = "scope") { fprintf(output, " %s:", prefix); switch (scope) { - case scope_invocation: - fprintf(output, "invocation"); - break; - case scope_subgroup: - fprintf(output, "subgroup"); - break; - case scope_workgroup: - fprintf(output, "workgroup"); - break; - case scope_queuefamily: - fprintf(output, "queuefamily"); - break; - case scope_device: - fprintf(output, "device"); - break; + case scope_invocation: fprintf(output, "invocation"); break; + case scope_subgroup: fprintf(output, "subgroup"); break; + case scope_workgroup: fprintf(output, "workgroup"); break; + case scope_queuefamily: fprintf(output, "queuefamily"); break; + case scope_device: fprintf(output, "device"); break; } } -static void print_sync(memory_sync_info sync, FILE *output) +static void +print_sync(memory_sync_info sync, FILE* output) { print_storage(sync.storage, output); print_semantics(sync.semantics, output); print_scope(sync.scope, output); } -static void print_instr_format_specific(const Instruction *instr, FILE *output) +static void +print_instr_format_specific(const Instruction* instr, FILE* output) { switch (instr->format) { case Format::SOPK: { @@ -319,9 +301,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) /* we usually should check the chip class for vmcnt/lgkm, but * insert_waitcnt() should fill it in regardless. */ unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); - if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt); - if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); - if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); + if (vmcnt != 63) + fprintf(output, " vmcnt(%d)", vmcnt); + if (((imm >> 4) & 0x7) < 0x7) + fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); + if (((imm >> 8) & 0x3F) < 0x3F) + fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); break; } case aco_opcode::s_endpgm: @@ -337,35 +322,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) case aco_opcode::s_sendmsg: { unsigned id = imm & sendmsg_id_mask; switch (id) { - case sendmsg_none: - fprintf(output, " sendmsg(MSG_NONE)"); - break; + case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break; case _sendmsg_gs: - fprintf(output, " sendmsg(gs%s%s, %u)", - imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); + fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? ", emit" : "", imm >> 8); break; case _sendmsg_gs_done: - fprintf(output, " sendmsg(gs_done%s%s, %u)", - imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8); - break; - case sendmsg_save_wave: - fprintf(output, " sendmsg(save_wave)"); - break; - case sendmsg_stall_wave_gen: - fprintf(output, " sendmsg(stall_wave_gen)"); - break; - case sendmsg_halt_waves: - fprintf(output, " sendmsg(halt_waves)"); - break; - case sendmsg_ordered_ps_done: - fprintf(output, " sendmsg(ordered_ps_done)"); - break; - case sendmsg_early_prim_dealloc: - fprintf(output, " sendmsg(early_prim_dealloc)"); - break; - case sendmsg_gs_alloc_req: - fprintf(output, " sendmsg(gs_alloc_req)"); + fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "", + imm & 0x20 ? 
", emit" : "", imm >> 8); break; + case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break; + case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break; + case sendmsg_halt_waves: fprintf(output, " sendmsg(halt_waves)"); break; + case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break; + case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break; + case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break; } break; } @@ -433,40 +404,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } case Format::MIMG: { const MIMG_instruction& mimg = instr->mimg(); - unsigned identity_dmask = !instr->definitions.empty() ? - (1 << instr->definitions[0].size()) - 1 : - 0xf; + unsigned identity_dmask = + !instr->definitions.empty() ? (1 << instr->definitions[0].size()) - 1 : 0xf; if ((mimg.dmask & identity_dmask) != identity_dmask) - fprintf(output, " dmask:%s%s%s%s", - mimg.dmask & 0x1 ? "x" : "", - mimg.dmask & 0x2 ? "y" : "", - mimg.dmask & 0x4 ? "z" : "", + fprintf(output, " dmask:%s%s%s%s", mimg.dmask & 0x1 ? "x" : "", + mimg.dmask & 0x2 ? "y" : "", mimg.dmask & 0x4 ? "z" : "", mimg.dmask & 0x8 ? "w" : ""); switch (mimg.dim) { - case ac_image_1d: - fprintf(output, " 1d"); - break; - case ac_image_2d: - fprintf(output, " 2d"); - break; - case ac_image_3d: - fprintf(output, " 3d"); - break; - case ac_image_cube: - fprintf(output, " cube"); - break; - case ac_image_1darray: - fprintf(output, " 1darray"); - break; - case ac_image_2darray: - fprintf(output, " 2darray"); - break; - case ac_image_2dmsaa: - fprintf(output, " 2dmsaa"); - break; - case ac_image_2darraymsaa: - fprintf(output, " 2darraymsaa"); - break; + case ac_image_1d: fprintf(output, " 1d"); break; + case ac_image_2d: fprintf(output, " 2d"); break; + case ac_image_3d: fprintf(output, " 3d"); break; + case ac_image_cube: fprintf(output, " cube"); break; + case ac_image_1darray: fprintf(output, " 1darray"); break; + case ac_image_2darray: fprintf(output, " 2darray"); break; + case ac_image_2dmsaa: fprintf(output, " 2dmsaa"); break; + case ac_image_2darraymsaa: fprintf(output, " 2darraymsaa"); break; } if (mimg.unrm) fprintf(output, " unrm"); @@ -495,10 +447,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) const Export_instruction& exp = instr->exp(); unsigned identity_mask = exp.compressed ? 0x5 : 0xf; if ((exp.enabled_mask & identity_mask) != identity_mask) - fprintf(output, " en:%c%c%c%c", - exp.enabled_mask & 0x1 ? 'r' : '*', - exp.enabled_mask & 0x2 ? 'g' : '*', - exp.enabled_mask & 0x4 ? 'b' : '*', + fprintf(output, " en:%c%c%c%c", exp.enabled_mask & 0x1 ? 'r' : '*', + exp.enabled_mask & 0x2 ? 'g' : '*', exp.enabled_mask & 0x4 ? 'b' : '*', exp.enabled_mask & 0x8 ? 
'a' : '*'); if (exp.compressed) fprintf(output, " compr"); @@ -624,15 +574,9 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) if (instr->isVOP3()) { const VOP3_instruction& vop3 = instr->vop3(); switch (vop3.omod) { - case 1: - fprintf(output, " *2"); - break; - case 2: - fprintf(output, " *4"); - break; - case 3: - fprintf(output, " *0.5"); - break; + case 1: fprintf(output, " *2"); break; + case 2: fprintf(output, " *4"); break; + case 3: fprintf(output, " *0.5"); break; } if (vop3.clamp) fprintf(output, " clamp"); @@ -641,8 +585,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } else if (instr->isDPP()) { const DPP_instruction& dpp = instr->dpp(); if (dpp.dpp_ctrl <= 0xff) { - fprintf(output, " quad_perm:[%d,%d,%d,%d]", - dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3, + fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3, (dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3); } else if (dpp.dpp_ctrl >= 0x101 && dpp.dpp_ctrl <= 0x10f) { fprintf(output, " row_shl:%d", dpp.dpp_ctrl & 0xf); @@ -678,21 +621,14 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } else if (instr->isSDWA()) { const SDWA_instruction& sdwa = instr->sdwa(); switch (sdwa.omod) { - case 1: - fprintf(output, " *2"); - break; - case 2: - fprintf(output, " *4"); - break; - case 3: - fprintf(output, " *0.5"); - break; + case 1: fprintf(output, " *2"); break; + case 2: fprintf(output, " *4"); break; + case 3: fprintf(output, " *0.5"); break; } if (sdwa.clamp) fprintf(output, " clamp"); switch (sdwa.dst_sel & sdwa_asuint) { - case sdwa_udword: - break; + case sdwa_udword: break; case sdwa_ubyte0: case sdwa_ubyte1: case sdwa_ubyte2: @@ -711,7 +647,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) } } -void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) +void +aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) { if (!instr->definitions.empty()) { for (unsigned i = 0; i < instr->definitions.size(); ++i) { @@ -723,10 +660,10 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) } fprintf(output, "%s", instr_info.name[(int)instr->opcode]); if (instr->operands.size()) { - bool *const abs = (bool *)alloca(instr->operands.size() * sizeof(bool)); - bool *const neg = (bool *)alloca(instr->operands.size() * sizeof(bool)); - bool *const opsel = (bool *)alloca(instr->operands.size() * sizeof(bool)); - uint8_t *const sel = (uint8_t *)alloca(instr->operands.size() * sizeof(uint8_t)); + bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool)); + bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool)); + bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool)); + uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t)); for (unsigned i = 0; i < instr->operands.size(); ++i) { abs[i] = false; neg[i] = false; @@ -792,8 +729,7 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) if (instr->isVOP3P()) { const VOP3P_instruction& vop3 = instr->vop3p(); if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) { - fprintf(output, ".%c%c", - vop3.opsel_lo & (1 << i) ? 'y' : 'x', + fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x', vop3.opsel_hi & (1 << i) ? 
'y' : 'x'); } if (vop3.neg_lo[i] && vop3.neg_hi[i]) @@ -808,7 +744,8 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags) print_instr_format_specific(instr, output); } -static void print_block_kind(uint16_t kind, FILE *output) +static void +print_block_kind(uint16_t kind, FILE* output) { if (kind & block_kind_uniform) fprintf(output, "uniform, "); @@ -844,7 +781,8 @@ static void print_block_kind(uint16_t kind, FILE *output) fprintf(output, "export_end, "); } -static void print_stage(Stage stage, FILE *output) +static void +print_stage(Stage stage, FILE* output) { fprintf(output, "ACO shader stage: "); @@ -888,7 +826,8 @@ static void print_stage(Stage stage, FILE *output) fprintf(output, "\n"); } -void aco_print_block(const Block* block, FILE *output, unsigned flags, const live& live_vars) +void +aco_print_block(const Block* block, FILE* output, unsigned flags, const live& live_vars) { fprintf(output, "BB%d\n", block->index); fprintf(output, "/* logical preds: "); @@ -927,19 +866,16 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags, const liv } } -void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags) +void +aco_print_program(const Program* program, FILE* output, const live& live_vars, unsigned flags) { switch (program->progress) { - case CompilationProgress::after_isel: - fprintf(output, "After Instruction Selection:\n"); - break; + case CompilationProgress::after_isel: fprintf(output, "After Instruction Selection:\n"); break; case CompilationProgress::after_spilling: fprintf(output, "After Spilling:\n"); flags |= print_kill; break; - case CompilationProgress::after_ra: - fprintf(output, "After RA:\n"); - break; + case CompilationProgress::after_ra: fprintf(output, "After RA:\n"); break; } print_stage(program->stage, output); @@ -965,9 +901,10 @@ void aco_print_program(const Program *program, FILE *output, const live& live_va fprintf(output, "\n"); } -void aco_print_program(const Program *program, FILE *output, unsigned flags) +void +aco_print_program(const Program* program, FILE* output, unsigned flags) { aco_print_program(program, output, live(), flags); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index c7ba4ff16a2..ce99779327b 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -36,7 +36,8 @@ namespace aco { -void setup_reduce_temp(Program* program) +void +setup_reduce_temp(Program* program) { unsigned last_top_level_block_idx = 0; unsigned maxSize = 0; @@ -69,7 +70,8 @@ void setup_reduce_temp(Program* program) if (reduceTmp_in_loop && block.loop_nest_depth == 0) { assert(inserted_at == (int)last_top_level_block_idx); - aco_ptr end{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)}; + aco_ptr end{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 
2 : 1, 0)}; end->operands[0] = Operand(reduceTmp); if (vtmp_in_loop) end->operands[1] = Operand(vtmp); @@ -89,7 +91,7 @@ void setup_reduce_temp(Program* program) std::vector>::iterator it; for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { - Instruction *instr = (*it).get(); + Instruction* instr = (*it).get(); if (instr->format != Format::PSEUDO_REDUCTION) continue; @@ -98,7 +100,8 @@ void setup_reduce_temp(Program* program) if ((int)last_top_level_block_idx != inserted_at) { reduceTmp = program->allocateTmp(reduceTmp.regClass()); - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(reduceTmp); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -110,18 +113,19 @@ void setup_reduce_temp(Program* program) } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; - instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + std::vector>& instructions = + program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), + std::move(create)); inserted_at = last_top_level_block_idx; } } /* same as before, except for the vector temporary instead of the reduce temporary */ unsigned cluster_size = instr->reduction().cluster_size; - bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || - op == fmin64 || op == fmax64 || op == umin64 || - op == umax64 || op == imin64 || op == imax64 || - op == imul64; + bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 || + op == fmax64 || op == umin64 || op == umax64 || op == imin64 || + op == imax64 || op == imul64; bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 || op == imul16 || op == imax16 || op == imin16 || op == umin16 || op == iadd64; @@ -138,15 +142,18 @@ void setup_reduce_temp(Program* program) vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { vtmp = program->allocateTmp(vtmp.regClass()); - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(vtmp); if (last_top_level_block_idx == block.index) { it = block.instructions.insert(it, std::move(create)); it++; } else { assert(last_top_level_block_idx < block.index); - std::vector>& instructions = program->blocks[last_top_level_block_idx].instructions; - instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + std::vector>& instructions = + program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), + std::move(create)); vtmp_inserted_at = last_top_level_block_idx; } } @@ -158,5 +165,4 @@ void setup_reduce_temp(Program* program) } } -}; - +}; // namespace aco diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 9723caddc47..3ec0b21db48 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ 
b/src/amd/compiler/aco_register_allocation.cpp @@ -37,10 +37,14 @@ namespace { struct ra_ctx; -unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc); -void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, RegClass rc); -std::pair get_subdword_definition_info(Program *program, const aco_ptr& instr, RegClass rc); -void add_subdword_definition(Program *program, aco_ptr& instr, unsigned idx, PhysReg reg); +unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, + unsigned idx, RegClass rc); +void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, + RegClass rc); +std::pair +get_subdword_definition_info(Program* program, const aco_ptr& instr, RegClass rc); +void add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, + PhysReg reg); struct assignment { PhysReg reg; @@ -71,12 +75,11 @@ struct ra_ctx { ra_test_policy policy; ra_ctx(Program* program_, ra_test_policy policy_) - : program(program_), - assignments(program->peekAllocationId()), - renames(program->blocks.size()), - policy(policy_) + : program(program_), assignments(program->peekAllocationId()), + renames(program->blocks.size()), policy(policy_) { - pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + pseudo_dummy.reset( + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); } @@ -92,31 +95,25 @@ struct PhysRegIterator { PhysReg reg; - PhysReg operator*() const { - return reg; - } + PhysReg operator*() const { return reg; } - PhysRegIterator& operator++() { + PhysRegIterator& operator++() + { reg.reg_b += 4; return *this; } - PhysRegIterator& operator--() { + PhysRegIterator& operator--() + { reg.reg_b -= 4; return *this; } - bool operator==(PhysRegIterator oth) const { - return reg == oth.reg; - } + bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } - bool operator!=(PhysRegIterator oth) const { - return reg != oth.reg; - } + bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } - bool operator<(PhysRegIterator oth) const { - return reg < oth.reg; - } + bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } }; /* Half-open register interval used in "sliding window"-style for-loops */ @@ -125,72 +122,65 @@ struct PhysRegInterval { unsigned size; /* Inclusive lower bound */ - PhysReg lo() const { - return lo_; - } + PhysReg lo() const { return lo_; } /* Exclusive upper bound */ - PhysReg hi() const { - return PhysReg { lo() + size }; - } + PhysReg hi() const { return PhysReg{lo() + size}; } - PhysRegInterval& operator+=(uint32_t stride) { - lo_ = PhysReg { lo_.reg() + stride }; + PhysRegInterval& operator+=(uint32_t stride) + { + lo_ = PhysReg{lo_.reg() + stride}; return *this; } - bool operator!=(const PhysRegInterval& oth) const { - return lo_ != oth.lo_ || size != oth.size; - } + bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } /* Construct a half-open interval, excluding the end register */ - static PhysRegInterval from_until(PhysReg first, PhysReg end) { - return { first, end - first }; - } + static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } - bool contains(PhysReg reg) const { - return lo() <= reg && reg < hi(); - } + bool contains(PhysReg reg) const { return lo() 
<= reg && reg < hi(); } - bool contains(const PhysRegInterval& needle) const { + bool contains(const PhysRegInterval& needle) const + { return needle.lo() >= lo() && needle.hi() <= hi(); } - PhysRegIterator begin() const { - return { lo_ }; - } + PhysRegIterator begin() const { return {lo_}; } - PhysRegIterator end() const { - return { PhysReg { lo_ + size } }; - } + PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } }; -bool intersects(const PhysRegInterval& a, const PhysRegInterval& b) { - return ((a.lo() >= b.lo() && a.lo() < b.hi()) || - (a.hi() > b.lo() && a.hi() <= b.hi())); +bool +intersects(const PhysRegInterval& a, const PhysRegInterval& b) +{ + return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi())); } /* Gets the stride for full (non-subdword) registers */ -uint32_t get_stride(RegClass rc) { - if (rc.type() == RegType::vgpr) { - return 1; - } else { - uint32_t size = rc.size(); - if (size == 2) { - return 2; - } else if (size >= 4) { - return 4; - } else { - return 1; - } - } +uint32_t +get_stride(RegClass rc) +{ + if (rc.type() == RegType::vgpr) { + return 1; + } else { + uint32_t size = rc.size(); + if (size == 2) { + return 2; + } else if (size >= 4) { + return 4; + } else { + return 1; + } + } } -PhysRegInterval get_reg_bounds(Program* program, RegType type) { +PhysRegInterval +get_reg_bounds(Program* program, RegType type) +{ if (type == RegType::vgpr) { - return { PhysReg { 256 }, (unsigned)program->max_reg_demand.vgpr }; + return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr}; } else { - return { PhysReg { 0 }, (unsigned)program->max_reg_demand.sgpr }; + return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr}; } } @@ -200,7 +190,8 @@ struct DefInfo { uint8_t stride; RegClass rc; - DefInfo(ra_ctx& ctx, aco_ptr& instr, RegClass rc_, int operand) : rc(rc_) { + DefInfo(ra_ctx& ctx, aco_ptr& instr, RegClass rc_, int operand) : rc(rc_) + { size = rc.size(); stride = get_stride(rc); @@ -229,20 +220,17 @@ struct DefInfo { class RegisterFile { public: - RegisterFile() {regs.fill(0);} + RegisterFile() { regs.fill(0); } std::array regs; std::map> subdword_regs; - const uint32_t& operator [] (PhysReg index) const { - return regs[index]; - } + const uint32_t& operator[](PhysReg index) const { return regs[index]; } - uint32_t& operator [] (PhysReg index) { - return regs[index]; - } + uint32_t& operator[](PhysReg index) { return regs[index]; } - unsigned count_zero(PhysRegInterval reg_interval) { + unsigned count_zero(PhysRegInterval reg_interval) + { unsigned res = 0; for (PhysReg reg : reg_interval) res += !regs[reg]; @@ -250,7 +238,8 @@ public: } /* Returns true if any of the bytes in the given range are allocated or blocked */ - bool test(PhysReg start, unsigned num_bytes) { + bool test(PhysReg start, unsigned num_bytes) + { for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { assert(i <= 511); if (regs[i] & 0x0FFFFFFF) @@ -266,14 +255,16 @@ public: return false; } - void block(PhysReg start, RegClass rc) { + void block(PhysReg start, RegClass rc) + { if (rc.is_subdword()) fill_subdword(start, rc.bytes(), 0xFFFFFFFF); else fill(start, rc.size(), 0xFFFFFFFF); } - bool is_blocked(PhysReg start) { + bool is_blocked(PhysReg start) + { if (regs[start] == 0xFFFFFFFF) return true; if (regs[start] == 0xF0000000) { @@ -284,7 +275,8 @@ public: return false; } - bool is_empty_or_blocked(PhysReg start) { + bool is_empty_or_blocked(PhysReg start) + { /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we 
compare the * incremented value to 1 */ if (regs[start] == 0xF0000000) { @@ -293,50 +285,53 @@ public: return regs[start] + 1 <= 1; } - void clear(PhysReg start, RegClass rc) { + void clear(PhysReg start, RegClass rc) + { if (rc.is_subdword()) fill_subdword(start, rc.bytes(), 0); else fill(start, rc.size(), 0); } - void fill(Operand op) { + void fill(Operand op) + { if (op.regClass().is_subdword()) fill_subdword(op.physReg(), op.bytes(), op.tempId()); else fill(op.physReg(), op.size(), op.tempId()); } - void clear(Operand op) { - clear(op.physReg(), op.regClass()); - } + void clear(Operand op) { clear(op.physReg(), op.regClass()); } - void fill(Definition def) { + void fill(Definition def) + { if (def.regClass().is_subdword()) fill_subdword(def.physReg(), def.bytes(), def.tempId()); else fill(def.physReg(), def.size(), def.tempId()); } - void clear(Definition def) { - clear(def.physReg(), def.regClass()); - } + void clear(Definition def) { clear(def.physReg(), def.regClass()); } - unsigned get_id(PhysReg reg) { + unsigned get_id(PhysReg reg) + { return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; } private: - void fill(PhysReg start, unsigned size, uint32_t val) { + void fill(PhysReg start, unsigned size, uint32_t val) + { for (unsigned i = 0; i < size; i++) regs[start + i] = val; } - void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) { + void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) + { fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000); for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { /* emplace or get */ - std::array& sub = subdword_regs.emplace(i, std::array{0, 0, 0, 0}).first->second; + std::array& sub = + subdword_regs.emplace(i, std::array{0, 0, 0, 0}).first->second; for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) sub[j] = val; @@ -348,22 +343,25 @@ private: } }; - std::set> find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval); /* helper function for debugging */ -UNUSED void print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) { +UNUSED void +print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) +{ if (reg_file[reg] == 0xFFFFFFFF) { printf("☐"); } else if (reg_file[reg]) { const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000); if (show_subdword_alloc) { const char* block_chars[] = { + // clang-format off "?", "▘", "▝", "▀", "▖", "▌", "▞", "▛", "▗", "▚", "▐", "▜", "▄", "▙", "▟", "▉" + // clang-format on }; unsigned index = 0; for (int i = 0; i < 4; ++i) { @@ -387,7 +385,8 @@ UNUSED void print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjace } /* helper function for debugging */ -UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) +UNUSED void +print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) { PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr); char reg_char = vgprs ? 
'v' : 's'; @@ -403,7 +402,8 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) /* print usage */ auto line_begin_it = regs.begin(); while (line_begin_it != regs.end()) { - const int regs_in_line = std::min(max_regs_per_line, std::distance(line_begin_it, regs.end())); + const int regs_in_line = + std::min(max_regs_per_line, std::distance(line_begin_it, regs.end())); if (line_begin_it == regs.begin()) { printf("%cgprs: ", reg_char); @@ -413,9 +413,9 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) const auto line_end_it = std::next(line_begin_it, regs_in_line); for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) { - bool has_adjacent_variable = (std::next(reg_it) != line_end_it && - reg_file[*reg_it] != reg_file[*std::next(reg_it)] && - reg_file[*std::next(reg_it)]); + bool has_adjacent_variable = + (std::next(reg_it) != line_end_it && + reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]); print_reg(reg_file, *reg_it, has_adjacent_variable); } @@ -423,11 +423,13 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) printf("\n"); } - const unsigned free_regs = std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); + const unsigned free_regs = + std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); /* print assignments ordered by registers */ - std::map> regs_to_vars; /* maps to byte size and temp id */ + std::map> + regs_to_vars; /* maps to byte size and temp id */ for (const auto& size_id : find_vars(ctx, reg_file, regs)) { auto reg = ctx.assignments[size_id.second].reg; ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id); @@ -439,7 +441,8 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) const auto& size_id = reg_and_var.second; printf("%%%u ", size_id.second); - if (ctx.orig_names.count(size_id.second) && ctx.orig_names[size_id.second].id() != size_id.second) { + if (ctx.orig_names.count(size_id.second) && + ctx.orig_names[size_id.second].id() != size_id.second) { printf("(was %%%d) ", ctx.orig_names[size_id.second].id()); } printf("= %c[%d", reg_char, first_reg.reg() - regs.lo()); @@ -456,8 +459,9 @@ UNUSED void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) } } - -unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc) +unsigned +get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, + RegClass rc) { /* v_readfirstlane_b32 cannot use SDWA */ if (instr->opcode == aco_opcode::p_as_uniform) @@ -477,8 +481,7 @@ unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr switch (instr->opcode) { case aco_opcode::ds_write_b8: - case aco_opcode::ds_write_b16: - return chip >= GFX8 ? 2 : 4; + case aco_opcode::ds_write_b16: return chip >= GFX8 ? 2 : 4; case aco_opcode::buffer_store_byte: case aco_opcode::buffer_store_short: case aco_opcode::flat_store_byte: @@ -486,16 +489,16 @@ unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr case aco_opcode::scratch_store_byte: case aco_opcode::scratch_store_short: case aco_opcode::global_store_byte: - case aco_opcode::global_store_short: - return chip >= GFX9 ? 2 : 4; - default: - break; + case aco_opcode::global_store_short: return chip >= GFX9 ? 
2 : 4; + default: break; } return 4; } -void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, RegClass rc) +void +add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, unsigned byte, + RegClass rc) { chip_class chip = ctx.program->chip_class; if (instr->isPseudo() || byte == 0) @@ -505,18 +508,10 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { switch (byte) { - case 0: - instr->opcode = aco_opcode::v_cvt_f32_ubyte0; - break; - case 1: - instr->opcode = aco_opcode::v_cvt_f32_ubyte1; - break; - case 2: - instr->opcode = aco_opcode::v_cvt_f32_ubyte2; - break; - case 3: - instr->opcode = aco_opcode::v_cvt_f32_ubyte3; - break; + case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } return; } else if (can_use_SDWA(chip, instr, false)) { @@ -565,7 +560,8 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx } /* minimum_stride, bytes_written */ -std::pair get_subdword_definition_info(Program *program, const aco_ptr& instr, RegClass rc) +std::pair +get_subdword_definition_info(Program* program, const aco_ptr& instr, RegClass rc) { chip_class chip = program->chip_class; @@ -581,11 +577,8 @@ std::pair get_subdword_definition_info(Program *program, con case aco_opcode::v_mad_i16: case aco_opcode::v_fma_f16: case aco_opcode::v_div_fixup_f16: - case aco_opcode::v_interp_p2_f16: - bytes_written = chip >= GFX9 ? rc.bytes() : 4u; - break; - default: - break; + case aco_opcode::v_interp_p2_f16: bytes_written = chip >= GFX9 ? rc.bytes() : 4u; break; + default: break; } bytes_written = bytes_written > 4 ? 
align(bytes_written, 4) : bytes_written; bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u); @@ -611,16 +604,15 @@ std::pair get_subdword_definition_info(Program *program, con return std::make_pair(2u, 2u); else return std::make_pair(2u, 4u); - case aco_opcode::v_fma_mixlo_f16: - return std::make_pair(2u, 2u); - default: - break; + case aco_opcode::v_fma_mixlo_f16: return std::make_pair(2u, 2u); + default: break; } return std::make_pair(4u, bytes_written); } -void add_subdword_definition(Program *program, aco_ptr& instr, unsigned idx, PhysReg reg) +void +add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, PhysReg reg) { RegClass rc = instr->definitions[idx].regClass(); chip_class chip = program->chip_class; @@ -632,7 +624,8 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi if (reg.byte() || chip < GFX10 || def_size > rc.bytes() * 8u) convert_to_SDWA(chip, instr); return; - } else if (reg.byte() && rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) { + } else if (reg.byte() && rc.bytes() == 2 && + can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) { VOP3_instruction& vop3 = instr->vop3(); if (reg.byte() == 2) vop3.opsel |= (1 << 3); /* dst in high half */ @@ -667,7 +660,8 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi } } -void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) +void +adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) { uint16_t max_addressible_sgpr = ctx.sgpr_limit; unsigned size = rc.size(); @@ -687,9 +681,10 @@ enum UpdateRenames { }; MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); -void update_renames(ra_ctx& ctx, RegisterFile& reg_file, - std::vector>& parallelcopies, - aco_ptr& instr, UpdateRenames flags) +void +update_renames(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, + aco_ptr& instr, UpdateRenames flags) { /* clear operands */ for (std::pair& copy : parallelcopies) { @@ -765,9 +760,9 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file, bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef(); for (std::pair& pc : parallelcopies) { PhysReg def_reg = pc.second.physReg(); - omit_renaming &= def_reg > copy.first.physReg() ? - (copy.first.physReg() + copy.first.size() <= def_reg.reg()) : - (def_reg + pc.second.size() <= copy.first.physReg().reg()); + omit_renaming &= def_reg > copy.first.physReg() + ? (copy.first.physReg() + copy.first.size() <= def_reg.reg()) + : (def_reg + pc.second.size() <= copy.first.physReg().reg()); } if (omit_renaming) { if (first) @@ -791,9 +786,8 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file, } } -std::pair get_reg_simple(ra_ctx& ctx, - RegisterFile& reg_file, - DefInfo info) +std::pair +get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -811,15 +805,18 @@ std::pair get_reg_simple(ra_ctx& ctx, return res; } - auto is_free = [&](PhysReg reg_index) { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; + auto is_free = [&](PhysReg reg_index) + { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; if (stride == 1) { /* best fit algorithm: find the smallest gap to fit in the variable */ - PhysRegInterval best_gap { PhysReg { 0 }, UINT_MAX }; - const unsigned max_gpr = (rc.type() == RegType::vgpr) ? 
(256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; + PhysRegInterval best_gap{PhysReg{0}, UINT_MAX}; + const unsigned max_gpr = + (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; PhysRegIterator reg_it = bounds.begin(); - const PhysRegIterator end_it = std::min(bounds.end(), std::max(PhysRegIterator { PhysReg { max_gpr + 1 } }, reg_it)); + const PhysRegIterator end_it = + std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it)); while (reg_it != bounds.end()) { /* Find the next chunk of available register slots */ reg_it = std::find_if(reg_it, end_it, is_free); @@ -859,14 +856,15 @@ std::pair get_reg_simple(ra_ctx& ctx, if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) || ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) || ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0)) - best_gap = { PhysReg { best_gap.lo() + buffer }, best_gap.size - buffer }; + best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer}; } adjust_max_used_regs(ctx, rc, best_gap.lo()); return {best_gap.lo(), true}; } - for (PhysRegInterval reg_win = { bounds.lo(), size }; reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { if (reg_file[reg_win.lo()] != 0) { continue; } @@ -887,14 +885,15 @@ std::pair get_reg_simple(ra_ctx& ctx, if (!bounds.contains(PhysReg{entry.first})) continue; - for (unsigned i = 0; i < 4; i+= info.stride) { + for (unsigned i = 0; i < 4; i += info.stride) { /* check if there's a block of free bytes large enough to hold the register */ - bool reg_found = std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], - [](unsigned v) { return v == 0; }); + bool reg_found = + std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], + [](unsigned v) { return v == 0; }); /* check if also the neighboring reg is free if needed */ if (reg_found && i + rc.bytes() > 4) - reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); + reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); if (reg_found) { PhysReg res{entry.first}; @@ -910,8 +909,8 @@ std::pair get_reg_simple(ra_ctx& ctx, } /* collect variables from a register area and clear reg_file */ -std::set> find_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval) +std::set> +find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { std::set> vars; for (PhysReg j : reg_interval) { @@ -935,8 +934,8 @@ std::set> find_vars(ra_ctx& ctx, RegisterFile& reg } /* collect variables from a register area and clear reg_file */ -std::set> collect_vars(ra_ctx& ctx, RegisterFile& reg_file, - const PhysRegInterval reg_interval) +std::set> +collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { std::set> vars = find_vars(ctx, reg_file, reg_interval); for (std::pair size_id : vars) { @@ -946,17 +945,18 @@ std::set> collect_vars(ra_ctx& ctx, RegisterFile& return vars; } -bool get_regs_for_copies(ra_ctx& ctx, - RegisterFile& reg_file, - std::vector>& parallelcopies, - const std::set> &vars, - const PhysRegInterval bounds, - aco_ptr& instr, - const PhysRegInterval def_reg) +bool +get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, + const std::set>& vars, + const PhysRegInterval bounds, aco_ptr& instr, + const PhysRegInterval def_reg) { /* variables are sorted from small sized to large */ - /* NOTE: 
variables are also sorted by ID. this only affects a very small number of shaders slightly though. */ - for (std::set>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) { + /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders + * slightly though. */ + for (std::set>::const_reverse_iterator it = vars.rbegin(); + it != vars.rend(); ++it) { unsigned id = it->second; assignment& var = ctx.assignments[id]; DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1); @@ -980,7 +980,8 @@ bool get_regs_for_copies(ra_ctx& ctx, PhysReg reg(def_reg.lo()); for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { - res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())}; + res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && + !reg_file.test(reg, var.rc.bytes())}; break; } reg.reg_b += instr->operands[i].bytes(); @@ -1021,8 +1022,8 @@ bool get_regs_for_copies(ra_ctx& ctx, /* we use a sliding window to find potential positions */ unsigned stride = var.rc.is_subdword() ? 1 : info.stride; - for (PhysRegInterval reg_win { bounds.lo(), size }; - reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { if (!is_dead_operand && intersects(reg_win, def_reg)) continue; @@ -1082,7 +1083,7 @@ bool get_regs_for_copies(ra_ctx& ctx, if (num_moves == 0xFF) return false; - PhysRegInterval reg_win { best_pos, size }; + PhysRegInterval reg_win{best_pos, size}; /* collect variables and block reg file */ std::set> new_vars = collect_vars(ctx, reg_file, reg_win); @@ -1105,12 +1106,10 @@ bool get_regs_for_copies(ra_ctx& ctx, return true; } - -std::pair get_reg_impl(ra_ctx& ctx, - RegisterFile& reg_file, - std::vector>& parallelcopies, - const DefInfo& info, - aco_ptr& instr) +std::pair +get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, + std::vector>& parallelcopies, const DefInfo& info, + aco_ptr& instr) { const PhysRegInterval& bounds = info.bounds; uint32_t size = info.size; @@ -1125,9 +1124,7 @@ std::pair get_reg_impl(ra_ctx& ctx, std::bitset<256> is_killed_operand; /* per-register */ for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { Operand& op = instr->operands[j]; - if (op.isTemp() && - op.isFirstKillBeforeDef() && - bounds.contains(op.physReg()) && + if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) && !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) { assert(op.isFixed()); @@ -1147,12 +1144,13 @@ std::pair get_reg_impl(ra_ctx& ctx, op_moves = size - (regs_free - killed_ops); /* find the best position to place the definition */ - PhysRegInterval best_win = { bounds.lo(), size }; + PhysRegInterval best_win = {bounds.lo(), size}; unsigned num_moves = 0xFF; unsigned num_vars = 0; /* we use a sliding window to check potential positions */ - for (PhysRegInterval reg_win = { bounds.lo(), size }; reg_win.hi() <= bounds.hi(); reg_win += stride) { + for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); + reg_win += stride) { /* first check if the register window starts in the middle of an * allocated variable: this is what we have to fix to allow for * num_moves > size */ @@ -1232,12 +1230,10 @@ std::pair get_reg_impl(ra_ctx& ctx, * or which are in the definition space */ PhysReg reg = best_win.lo(); for (Operand& op 
: instr->operands) { - if (op.isTemp() && op.isFirstKillBeforeDef() && - op.getTemp().type() == rc.type()) { - if (op.physReg() != reg && - (ctx.program->chip_class >= GFX9 || - (op.physReg().advance(op.bytes()) > best_win.lo() && - op.physReg() < best_win.hi()))) { + if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) { + if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 || + (op.physReg().advance(op.bytes()) > best_win.lo() && + op.physReg() < best_win.hi()))) { vars.emplace(op.bytes(), op.tempId()); tmp_file.clear(op); } else { @@ -1264,11 +1260,9 @@ std::pair get_reg_impl(ra_ctx& ctx, return {best_win.lo(), true}; } -bool get_reg_specified(ra_ctx& ctx, - RegisterFile& reg_file, - RegClass rc, - aco_ptr& instr, - PhysReg reg) +bool +get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr& instr, + PhysReg reg) { /* catch out-of-range registers */ if (reg >= PhysReg{512}) @@ -1286,9 +1280,9 @@ bool get_reg_specified(ra_ctx& ctx, if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0) return false; - PhysRegInterval reg_win = { reg, rc.size() }; + PhysRegInterval reg_win = {reg, rc.size()}; PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); - PhysRegInterval vcc_win = { vcc, 2 }; + PhysRegInterval vcc_win = {vcc, 2}; /* VCC is outside the bounds */ bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win); bool is_m0 = rc == s1 && reg == m0; @@ -1309,11 +1303,15 @@ bool get_reg_specified(ra_ctx& ctx, return true; } -bool increase_register_file(ra_ctx& ctx, RegType type) { +bool +increase_register_file(ra_ctx& ctx, RegType type) +{ if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, + ctx.program->max_reg_demand.sgpr)); } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { - update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, + ctx.program->max_reg_demand.sgpr + 1)); } else { return false; } @@ -1338,9 +1336,9 @@ struct IDAndInfo { * one. If one of the variables has 0xffffffff as an ID, the register assigned * for that variable will be returned. */ -PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, - std::vector>& parallelcopies, - PhysReg start) +PhysReg +compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, + std::vector>& parallelcopies, PhysReg start) { /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword * temporary sizes to dwords. @@ -1351,18 +1349,21 @@ PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& var sorted.emplace_back(var.id, info); } - std::sort(sorted.begin(), sorted.end(), [&ctx](const IDAndInfo& a, - const IDAndInfo& b) { - unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); - unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 
1 : 4); - if (a_stride > b_stride) - return true; - if (a_stride < b_stride) - return false; - if (a.id == 0xffffffff || b.id == 0xffffffff) - return a.id == 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ - return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; - }); + std::sort( + sorted.begin(), sorted.end(), + [&ctx](const IDAndInfo& a, const IDAndInfo& b) + { + unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); + unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4); + if (a_stride > b_stride) + return true; + if (a_stride < b_stride) + return false; + if (a.id == 0xffffffff || b.id == 0xffffffff) + return a.id == + 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ + return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; + }); PhysReg next_reg = start; PhysReg space_reg; @@ -1395,7 +1396,8 @@ PhysReg compact_relocate_vars(ra_ctx& ctx, const std::vector& var return space_reg; } -bool is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction *instr) +bool +is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) { PhysReg first{512}; for (unsigned i = 0; i < instr->operands.size() - 3u; i++) { @@ -1424,10 +1426,8 @@ bool is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction *inst return true; } -std::pair get_reg_vector(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - aco_ptr& instr) +std::pair +get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr& instr) { Instruction* vec = ctx.vectors[temp.id()]; unsigned first_operand = vec->format == Format::MIMG ? 3 : 0; @@ -1448,9 +1448,7 @@ std::pair get_reg_vector(ra_ctx& ctx, */ for (unsigned i = first_operand; i < vec->operands.size(); i++) { Operand& op = vec->operands[i]; - if (op.isTemp() && - op.tempId() != temp.id() && - op.getTemp().type() == temp.type() && + if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() && ctx.assignments[op.tempId()].assigned) { PhysReg reg = ctx.assignments[op.tempId()].reg; reg.reg_b += (our_offset - their_offset); @@ -1477,12 +1475,10 @@ std::pair get_reg_vector(ra_ctx& ctx, return {{}, false}; } -PhysReg get_reg(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - std::vector>& parallelcopies, - aco_ptr& instr, - int operand_index=-1) +PhysReg +get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, + std::vector>& parallelcopies, aco_ptr& instr, + int operand_index = -1) { auto split_vec = ctx.split_vectors.find(temp.id()); if (split_vec != ctx.split_vectors.end()) { @@ -1581,11 +1577,10 @@ PhysReg get_reg(ra_ctx& ctx, return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index); } -PhysReg get_reg_create_vector(ra_ctx& ctx, - RegisterFile& reg_file, - Temp temp, - std::vector>& parallelcopies, - aco_ptr& instr) +PhysReg +get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, + std::vector>& parallelcopies, + aco_ptr& instr) { RegClass rc = temp.regClass(); /* create_vector instructions have different costs w.r.t. 
register coalescing */ @@ -1594,16 +1589,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, uint32_t stride = get_stride(rc); PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); - //TODO: improve p_create_vector for sub-dword vectors + // TODO: improve p_create_vector for sub-dword vectors - PhysReg best_pos { 0xFFF }; + PhysReg best_pos{0xFFF}; unsigned num_moves = 0xFF; bool best_war_hint = true; /* test for each operand which definition placement causes the least shuffle instructions */ - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { // TODO: think about, if we can alias live operands on the same register - if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type()) + if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || + instr->operands[i].getTemp().type() != rc.type()) continue; if (offset > instr->operands[i].physReg().reg_b) @@ -1612,7 +1609,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, unsigned reg_lower = instr->operands[i].physReg().reg_b - offset; if (reg_lower % 4) continue; - PhysRegInterval reg_win = { PhysReg { reg_lower / 4 }, size }; + PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size}; unsigned k = 0; /* no need to check multiple times */ @@ -1623,9 +1620,11 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, // TODO: this can be improved */ if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0) continue; - if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 && reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) + if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 && + reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) continue; - if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 && reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) + if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 && + reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) continue; /* count variables to be moved and check war_hint */ @@ -1656,9 +1655,9 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, continue; /* count operands in wrong positions */ - for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); offset2 += instr->operands[j].bytes(), j++) { - if (j == i || - !instr->operands[j].isTemp() || + for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); + offset2 += instr->operands[j].bytes(), j++) { + if (j == i || !instr->operands[j].isTemp() || instr->operands[j].getTemp().type() != rc.type()) continue; if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2) @@ -1678,17 +1677,19 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, /* re-enable killed operands which are in the wrong position */ RegisterFile tmp_file(reg_file); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { - if (instr->operands[i].isTemp() && - instr->operands[i].isFirstKillBeforeDef() && + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() && instr->operands[i].physReg().reg_b != best_pos.reg_b + offset) tmp_file.fill(instr->operands[i]); } /* collect variables to be moved */ - 
std::set> vars = collect_vars(ctx, tmp_file, PhysRegInterval { best_pos, size }); + std::set> vars = + collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); + offset += instr->operands[i].bytes(), i++) { if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type()) continue; @@ -1700,14 +1701,15 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, if (ctx.program->chip_class >= GFX9 && !correct_pos) { vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); tmp_file.clear(instr->operands[i]); - /* fill operands which are in the correct position to avoid overwriting */ + /* fill operands which are in the correct position to avoid overwriting */ } else if (correct_pos) { tmp_file.fill(instr->operands[i]); } } bool success = false; std::vector> pc; - success = get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval { best_pos, size }); + success = + get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size}); if (!success) { if (!increase_register_file(ctx, temp.type())) { @@ -1723,9 +1725,8 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, return best_pos; } -void handle_pseudo(ra_ctx& ctx, - const RegisterFile& reg_file, - Instruction* instr) +void +handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) { if (instr->format != Format::PSEUDO) return; @@ -1736,10 +1737,8 @@ void handle_pseudo(ra_ctx& ctx, case aco_opcode::p_create_vector: case aco_opcode::p_split_vector: case aco_opcode::p_parallelcopy: - case aco_opcode::p_wqm: - break; - default: - return; + case aco_opcode::p_wqm: break; + default: return; } /* if all definitions are vgpr, no need to care for SCC */ @@ -1761,8 +1760,8 @@ void handle_pseudo(ra_ctx& ctx, if (op.isTemp() && op.regClass().is_subdword()) reads_subdword = true; } - bool needs_scratch_reg = (writes_sgpr && reads_sgpr) || - (ctx.program->chip_class <= GFX7 && reads_subdword); + bool needs_scratch_reg = + (writes_sgpr && reads_sgpr) || (ctx.program->chip_class <= GFX7 && reads_subdword); if (!needs_scratch_reg) return; @@ -1789,7 +1788,9 @@ void handle_pseudo(ra_ctx& ctx, } } -bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned idx, PhysReg reg, RegClass rc) +bool +operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned idx, PhysReg reg, + RegClass rc) { if (instr->operands[idx].isFixed()) return instr->operands[idx].physReg() == reg; @@ -1798,9 +1799,9 @@ bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned instr->opcode == aco_opcode::v_writelane_b32_e64; if (chip <= GFX9 && is_writelane && idx <= 1) { /* v_writelane_b32 can take two sgprs but only if one is m0. 
*/ - bool is_other_sgpr = instr->operands[!idx].isTemp() && - (!instr->operands[!idx].isFixed() || - instr->operands[!idx].physReg() != m0); + bool is_other_sgpr = + instr->operands[!idx].isTemp() && + (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0); if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) { instr->operands[idx].setFixed(m0); return reg == m0; @@ -1815,19 +1816,20 @@ bool operand_can_use_reg(chip_class chip, aco_ptr& instr, unsigned switch (instr->format) { case Format::SMEM: - return reg != scc && - reg != exec && + return reg != scc && reg != exec && (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ - (reg != vcc || (instr->definitions.empty() && idx == 2) || chip >= GFX10); /* sdata can be vcc */ + (reg != vcc || (instr->definitions.empty() && idx == 2) || + chip >= GFX10); /* sdata can be vcc */ default: // TODO: there are more instructions with restrictions on registers return true; } } -void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, - std::vector>& parallelcopy, - aco_ptr& instr, Operand& operand, unsigned operand_index) +void +get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, + std::vector>& parallelcopy, + aco_ptr& instr, Operand& operand, unsigned operand_index) { /* check if the operand is fixed */ PhysReg src = ctx.assignments[operand.tempId()].reg; @@ -1841,31 +1843,34 @@ void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, RegisterFile tmp_file(register_file); - std::set> blocking_vars = collect_vars(ctx, tmp_file, target); + std::set> blocking_vars = + collect_vars(ctx, tmp_file, target); - tmp_file.clear(src, operand.regClass()); //TODO: try to avoid moving block vars to src + tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src tmp_file.block(operand.physReg(), operand.regClass()); DefInfo info(ctx, instr, operand.regClass(), -1); - get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, PhysRegInterval()); + get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, + PhysRegInterval()); } dst = operand.physReg(); } else { dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); - update_renames(ctx, register_file, parallelcopy, instr, - instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); + update_renames( + ctx, register_file, parallelcopy, instr, + instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); } Operand pc_op = operand; pc_op.setFixed(src); Definition pc_def = Definition(dst, pc_op.regClass()); parallelcopy.emplace_back(pc_op, pc_def); - update_renames(ctx, register_file, parallelcopy, instr, - rename_not_killed_ops | fill_killed_ops); + update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); } -Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) +Temp +read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) { std::unordered_map::iterator it = ctx.renames[block_idx].find(val.id()); if (it == ctx.renames[block_idx].end()) @@ -1874,7 +1879,8 @@ Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) return it->second; } -Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) +Temp +handle_live_in(ra_ctx& ctx, Temp val, Block* block) { std::vector& preds = val.is_linear() ? 
block->linear_preds : block->logical_preds; if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) @@ -1886,7 +1892,7 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) } /* there are multiple predecessors and the block is sealed */ - Temp *const ops = (Temp *)alloca(preds.size() * sizeof(Temp)); + Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp)); /* get the rename from each predecessor and check if they are the same */ Temp new_val; @@ -1902,7 +1908,8 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) if (needs_phi) { /* the variable has been renamed differently in the predecessors: we need to insert a phi */ aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_val = ctx.program->allocateTmp(val.regClass()); phi->definitions[0] = Definition(new_val); for (unsigned i = 0; i < preds.size(); i++) { @@ -1921,8 +1928,9 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) return new_val; } -void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, - uint32_t loop_header_idx, uint32_t loop_exit_idx) +void +handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx, + uint32_t loop_exit_idx) { Block& loop_header = ctx.program->blocks[loop_header_idx]; std::unordered_map renames; @@ -1963,9 +1971,8 @@ void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, aco_ptr& phi = loop_header.instructions[i]; if (!is_phi(phi)) break; - const std::vector& preds = phi->opcode == aco_opcode::p_phi ? - loop_header.logical_preds : - loop_header.linear_preds; + const std::vector& preds = + phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds; for (unsigned j = 1; j < phi->operands.size(); j++) { Operand& op = phi->operands[j]; if (!op.isTemp()) @@ -2016,7 +2023,8 @@ void handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, * Reg-to-reg moves (renames) from previous blocks are taken into account and * the SSA is repaired by inserting corresponding phi-nodes. */ -RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& block) +RegisterFile +init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& block) { if (block.kind & block_kind_loop_exit) { uint32_t header = ctx.loop_header.back(); @@ -2054,9 +2062,8 @@ RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_b for (aco_ptr& instr : block.instructions) { if (!is_phi(instr)) break; - const std::vector& preds = instr->opcode == aco_opcode::p_phi ? - block.logical_preds : - block.linear_preds; + const std::vector& preds = + instr->opcode == aco_opcode::p_phi ? 
block.logical_preds : block.linear_preds; for (unsigned i = 0; i < instr->operands.size(); i++) { Operand& operand = instr->operands[i]; @@ -2084,12 +2091,14 @@ RegisterFile init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_b return register_file; } -void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) +void +get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) { std::vector> phi_ressources; std::unordered_map temp_to_phi_ressources; - for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); block_rit++) { + for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); + block_rit++) { Block& block = *block_rit; /* first, compute the death points of all live vars within the block */ @@ -2109,7 +2118,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) affinity_related.emplace_back(instr->definitions[0].getTemp()); affinity_related.emplace_back(instr->definitions[0].getTemp()); for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) { + if (op.isTemp() && op.isKill() && + op.regClass() == instr->definitions[0].regClass()) { affinity_related.emplace_back(op.getTemp()); temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); } @@ -2119,7 +2129,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) /* add vector affinities */ if (instr->opcode == aco_opcode::p_create_vector) { for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type()) + if (op.isTemp() && op.isFirstKill() && + op.getTemp().type() == instr->definitions[0].getTemp().type()) ctx.vectors[op.tempId()] = instr.get(); } } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { @@ -2127,7 +2138,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) ctx.vectors[instr->operands[i].tempId()] = instr.get(); } - if (instr->opcode == aco_opcode::p_split_vector && instr->operands[0].isFirstKillBeforeDef()) + if (instr->opcode == aco_opcode::p_split_vector && + instr->operands[0].isFirstKillBeforeDef()) ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); /* add operands to live variables */ @@ -2144,28 +2156,26 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) continue; live.erase(def.tempId()); /* mark last-seen phi operand */ - std::unordered_map::iterator it = temp_to_phi_ressources.find(def.tempId()); - if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) { + std::unordered_map::iterator it = + temp_to_phi_ressources.find(def.tempId()); + if (it != temp_to_phi_ressources.end() && + def.regClass() == phi_ressources[it->second][0].regClass()) { phi_ressources[it->second][0] = def.getTemp(); /* try to coalesce phi affinities with parallelcopies */ Operand op = Operand(); switch (instr->opcode) { - case aco_opcode::p_parallelcopy: - op = instr->operands[i]; - break; + case aco_opcode::p_parallelcopy: op = instr->operands[i]; break; case aco_opcode::v_interp_p2_f32: case aco_opcode::v_writelane_b32: - case aco_opcode::v_writelane_b32_e64: - op = instr->operands[2]; - break; + case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break; case aco_opcode::v_fma_f32: case aco_opcode::v_fma_f16: case aco_opcode::v_pk_fma_f16: if (ctx.program->chip_class < GFX10) continue; - FALLTHROUGH; + FALLTHROUGH; case aco_opcode::v_mad_f32: case 
aco_opcode::v_mad_f16: if (instr->usesModifiers()) @@ -2173,8 +2183,7 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) op = instr->operands[2]; break; - default: - continue; + default: continue; } if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2196,8 +2205,8 @@ void get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) } /* end namespace */ - -void register_allocation(Program *program, std::vector& live_out_per_block, ra_test_policy policy) +void +register_allocation(Program* program, std::vector& live_out_per_block, ra_test_policy policy) { ra_ctx ctx(program, policy); get_affinities(ctx, live_out_per_block); @@ -2217,22 +2226,26 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc * We consider them incomplete phis and only handle the definition. */ /* look up the affinities */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); ++instr_it) { + for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); + ++instr_it) { aco_ptr& phi = *instr_it; if (!is_phi(phi)) break; Definition& definition = phi->definitions[0]; if (definition.isKill() || definition.isFixed()) - continue; + continue; if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() && ctx.assignments[ctx.affinities[definition.tempId()]].assigned) { - assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == definition.regClass()); + assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == + definition.regClass()); PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg; if (reg == scc) { /* only use scc if all operands are already placed there */ - bool use_scc = std::all_of(phi->operands.begin(), phi->operands.end(), - [] (const Operand& op) { return op.isTemp() && op.isFixed() && op.physReg() == scc;}); + bool use_scc = + std::all_of(phi->operands.begin(), phi->operands.end(), + [](const Operand& op) + { return op.isTemp() && op.isFixed() && op.physReg() == scc; }); if (!use_scc) continue; } @@ -2247,7 +2260,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* find registers for phis without affinity or where the register was blocked */ - for (instr_it = block.instructions.begin();instr_it != block.instructions.end(); ++instr_it) { + for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); + ++instr_it) { aco_ptr& phi = *instr_it; if (!is_phi(phi)) break; @@ -2274,16 +2288,18 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } } if (!definition.isFixed()) { - definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); + definition.setFixed( + get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); } /* process parallelcopy */ for (std::pair pc : parallelcopy) { /* see if it's a copy from a different phi */ - //TODO: prefer moving some previous phis over live-ins - //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec) - Instruction *prev_phi = NULL; + // TODO: prefer moving some previous phis over live-ins + // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a + // problem in practice since they can only be fixed to exec) + Instruction* prev_phi = NULL; std::vector>::iterator phi_it; for (phi_it = instructions.begin(); 
phi_it != instructions.end(); ++phi_it) { if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) @@ -2298,13 +2314,15 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* if so, just update that phi's register */ register_file.clear(prev_phi->definitions[0]); prev_phi->definitions[0].setFixed(pc.second.physReg()); - ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), pc.second.regClass()}; + ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), + pc.second.regClass()}; register_file.fill(prev_phi->definitions[0]); continue; } /* rename */ - std::unordered_map::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); + std::unordered_map::iterator orig_it = + ctx.orig_names.find(pc.first.tempId()); Temp orig = pc.first.getTemp(); if (orig_it != ctx.orig_names.end()) orig = orig_it->second; @@ -2314,9 +2332,12 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* otherwise, this is a live-in and we need to create a new phi * to move it in this block's predecessors */ - aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - std::vector& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; - aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_opcode opcode = + pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + std::vector& preds = + pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr new_phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_phi->definitions[0] = pc.second; for (unsigned i = 0; i < preds.size(); i++) new_phi->operands[i] = Operand(pc.first); @@ -2370,7 +2391,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc if (phi->operands[idx].isTemp() && phi->operands[idx].getTemp().type() == RegType::sgpr && phi->operands[idx].isFirstKillBeforeDef()) { - Definition phi_op(read_variable(ctx, phi->operands[idx].getTemp(), block.index)); + Definition phi_op( + read_variable(ctx, phi->operands[idx].getTemp(), block.index)); phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg); register_file.clear(phi_op); } @@ -2404,8 +2426,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc else get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); - if (instr->isEXP() || - (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || + if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || (instr->isDS() && instr->ds().gds)) { for (unsigned j = 0; j < operand.size(); j++) ctx.war_hint.set(operand.physReg().reg() + j); @@ -2425,14 +2446,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc instr->opcode == aco_opcode::v_mad_legacy_f16 || (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10)) && - instr->operands[2].isTemp() && - instr->operands[2].isKillBeforeDef() && - instr->operands[2].getTemp().type() == RegType::vgpr && - instr->operands[1].isTemp() && - instr->operands[1].getTemp().type() == RegType::vgpr && - !instr->usesModifiers() && - instr->operands[0].physReg().byte() == 0 && - instr->operands[1].physReg().byte() == 0 && + instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && + instr->operands[2].getTemp().type() == RegType::vgpr && 
instr->operands[1].isTemp() && + instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && + instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); auto it = ctx.affinities.find(def_id); @@ -2441,34 +2458,21 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { instr->format = Format::VOP2; switch (instr->opcode) { - case aco_opcode::v_mad_f32: - instr->opcode = aco_opcode::v_mac_f32; - break; - case aco_opcode::v_fma_f32: - instr->opcode = aco_opcode::v_fmac_f32; - break; + case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; + case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; case aco_opcode::v_mad_f16: - case aco_opcode::v_mad_legacy_f16: - instr->opcode = aco_opcode::v_mac_f16; - break; - case aco_opcode::v_fma_f16: - instr->opcode = aco_opcode::v_fmac_f16; - break; - case aco_opcode::v_pk_fma_f16: - instr->opcode = aco_opcode::v_pk_fmac_f16; - break; - default: - break; + case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; + case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; + case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; + default: break; } } } /* handle definitions which must have the same register as an operand */ if (instr->opcode == aco_opcode::v_interp_p2_f32 || - instr->opcode == aco_opcode::v_mac_f32 || - instr->opcode == aco_opcode::v_fmac_f32 || - instr->opcode == aco_opcode::v_mac_f16 || - instr->opcode == aco_opcode::v_fmac_f16 || + instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || + instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { @@ -2476,12 +2480,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) { instr->definitions[0].setFixed(instr->operands[0].physReg()); - } else if (instr->isMUBUF() && - instr->definitions.size() == 1 && + } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) { instr->definitions[0].setFixed(instr->operands[3].physReg()); - } else if (instr->isMIMG() && - instr->definitions.size() == 1 && + } else if (instr->isMIMG() && instr->definitions.size() == 1 && !instr->operands[2].isUndefined()) { instr->definitions[0].setFixed(instr->operands[2].physReg()); } @@ -2497,10 +2499,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ if (register_file.test(definition.physReg(), definition.bytes())) { - const PhysRegInterval def_regs { definition.physReg(), definition.size() }; + const PhysRegInterval def_regs{definition.physReg(), definition.size()}; /* create parallelcopy pair to move blocking vars */ - std::set> vars = collect_vars(ctx, register_file, def_regs); + std::set> vars = + collect_vars(ctx, register_file, def_regs); RegisterFile tmp_file(register_file); /* re-enable the killed operands, so that we don't move the blocking vars there */ @@ -2511,8 
+2514,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc ASSERTED bool success = false; DefInfo info(ctx, instr, definition.regClass(), -1); - success = get_regs_for_copies(ctx, tmp_file, parallelcopy, - vars, info.bounds, instr, + success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, def_regs); assert(success); @@ -2529,13 +2531,15 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* handle all other definitions */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { - Definition *definition = &instr->definitions[i]; + Definition* definition = &instr->definitions[i]; if (definition->isFixed() || !definition->isTemp()) continue; /* find free reg */ - if (definition->hasHint() && get_reg_specified(ctx, register_file, definition->regClass(), instr, definition->physReg())) { + if (definition->hasHint() && + get_reg_specified(ctx, register_file, definition->regClass(), instr, + definition->physReg())) { definition->setFixed(definition->physReg()); } else if (instr->opcode == aco_opcode::p_split_vector) { PhysReg reg = instr->operands[0].physReg(); @@ -2543,7 +2547,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc reg.reg_b += instr->definitions[j].bytes(); if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) definition->setFixed(reg); - } else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) { + } else if (instr->opcode == aco_opcode::p_wqm || + instr->opcode == aco_opcode::p_parallelcopy) { PhysReg reg = instr->operands[i].physReg(); if (instr->operands[i].isTemp() && instr->operands[i].getTemp().type() == definition->getTemp().type() && @@ -2568,17 +2573,21 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc definition->setFixed(reg); if (reg.byte() || register_file.test(reg, 4)) { add_subdword_definition(program, instr, i, reg); - definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */ + definition = &instr->definitions[i]; /* add_subdword_definition can invalidate + the reference */ } } else { definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr)); } update_renames(ctx, register_file, parallelcopy, instr, - instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0); + instr->opcode != aco_opcode::p_create_vector ? 
rename_not_killed_ops + : (UpdateRenames)0); } - assert(definition->isFixed() && ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || - (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); + assert( + definition->isFixed() && + ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || + (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); ctx.defs_done.set(i); ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()}; register_file.fill(*definition); @@ -2586,10 +2595,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc handle_pseudo(ctx, register_file, instr.get()); - /* kill definitions and late-kill operands and ensure that sub-dword operands can actually be read */ + /* kill definitions and late-kill operands and ensure that sub-dword operands can actually + * be read */ for (const Definition& def : instr->definitions) { - if (def.isTemp() && def.isKill()) - register_file.clear(def); + if (def.isTemp() && def.isKill()) + register_file.clear(def); } for (unsigned i = 0; i < instr->operands.size(); i++) { const Operand& op = instr->operands[i]; @@ -2602,11 +2612,14 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* emit parallelcopy */ if (!parallelcopy.empty()) { aco_ptr pc; - pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); + pc.reset(create_instruction(aco_opcode::p_parallelcopy, + Format::PSEUDO, parallelcopy.size(), + parallelcopy.size())); bool sgpr_operands_alias_defs = false; uint64_t sgpr_operands[4] = {0, 0, 0, 0}; for (unsigned i = 0; i < parallelcopy.size(); i++) { - if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { + if (temp_in_scc && parallelcopy[i].first.isTemp() && + parallelcopy[i].first.getTemp().type() == RegType::sgpr) { if (!sgpr_operands_alias_defs) { unsigned reg = parallelcopy[i].first.physReg().reg(); unsigned size = parallelcopy[i].first.getTemp().size(); @@ -2623,8 +2636,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc pc->definitions[i] = parallelcopy[i].second; assert(pc->operands[i].size() == pc->definitions[i].size()); - /* it might happen that the operand is already renamed. we have to restore the original name. */ - std::unordered_map::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); + /* it might happen that the operand is already renamed. we have to restore the + * original name. */ + std::unordered_map::iterator it = + ctx.orig_names.find(pc->operands[i].tempId()); Temp orig = it != ctx.orig_names.end() ? 
it->second : pc->operands[i].getTemp(); ctx.orig_names[pc->definitions[i].tempId()] = orig; ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); @@ -2651,24 +2666,27 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ - bool instr_needs_vop3 = !instr->isVOP3() && - ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || - (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) || - ((instr->opcode == aco_opcode::v_add_co_u32 || - instr->opcode == aco_opcode::v_addc_co_u32 || - instr->opcode == aco_opcode::v_sub_co_u32 || - instr->opcode == aco_opcode::v_subb_co_u32 || - instr->opcode == aco_opcode::v_subrev_co_u32 || - instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->definitions[1].physReg() == vcc)) || - ((instr->opcode == aco_opcode::v_addc_co_u32 || - instr->opcode == aco_opcode::v_subb_co_u32 || - instr->opcode == aco_opcode::v_subbrev_co_u32) && - !(instr->operands[2].physReg() == vcc))); + bool instr_needs_vop3 = + !instr->isVOP3() && + ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || + (instr->opcode == aco_opcode::v_cndmask_b32 && + !(instr->operands[2].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_add_co_u32 || + instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_sub_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subrev_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->definitions[1].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->operands[2].physReg() == vcc))); if (instr_needs_vop3) { /* if the first operand is a literal, we have to move it to a reg */ - if (instr->operands.size() && instr->operands[0].isLiteral() && program->chip_class < GFX10) { + if (instr->operands.size() && instr->operands[0].isLiteral() && + program->chip_class < GFX10) { bool can_sgpr = true; /* check, if we have to move to vgpr */ for (const Operand& op : instr->operands) { @@ -2692,9 +2710,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc aco_ptr mov; if (can_sgpr) - mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::s_mov_b32, + Format::SOP1, 1, 1)); else - mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); + mov.reset(create_instruction(aco_opcode::v_mov_b32, + Format::VOP1, 1, 1)); mov->operands[0] = instr->operands[0]; mov->definitions[0] = Definition(tmp); mov->definitions[0].setFixed(reg); @@ -2709,7 +2729,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + instr.reset(create_instruction( + tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); } @@ -2752,4 +2773,4 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc program->progress = 
CompilationProgress::after_ra; } -} +} // namespace aco diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp index 9ad2faced21..47653f8b6d3 100644 --- a/src/amd/compiler/aco_reindex_ssa.cpp +++ b/src/amd/compiler/aco_reindex_ssa.cpp @@ -34,8 +34,8 @@ struct idx_ctx { std::vector renames; }; -inline -void reindex_defs(idx_ctx& ctx, aco_ptr& instr) +inline void +reindex_defs(idx_ctx& ctx, aco_ptr& instr) { for (Definition& def : instr->definitions) { if (!def.isTemp()) @@ -48,8 +48,8 @@ void reindex_defs(idx_ctx& ctx, aco_ptr& instr) } } -inline -void reindex_ops(idx_ctx& ctx, aco_ptr& instr) +inline void +reindex_ops(idx_ctx& ctx, aco_ptr& instr) { for (Operand& op : instr->operands) { if (!op.isTemp()) @@ -60,7 +60,8 @@ void reindex_ops(idx_ctx& ctx, aco_ptr& instr) } } -void reindex_program(idx_ctx& ctx, Program* program) +void +reindex_program(idx_ctx& ctx, Program* program) { ctx.renames.resize(program->peekAllocationId()); @@ -88,12 +89,13 @@ void reindex_program(idx_ctx& ctx, Program* program) /* update program members */ program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()], program->private_segment_buffer.regClass()); - program->scratch_offset = Temp(ctx.renames[program->scratch_offset.id()], - program->scratch_offset.regClass()); + program->scratch_offset = + Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); program->temp_rc = ctx.temp_rc; } -void update_live_out(idx_ctx& ctx, std::vector& live_out) +void +update_live_out(idx_ctx& ctx, std::vector& live_out) { for (IDSet& set : live_out) { IDSet new_set; @@ -105,7 +107,8 @@ void update_live_out(idx_ctx& ctx, std::vector& live_out) } /* end namespace */ -void reindex_ssa(Program* program) +void +reindex_ssa(Program* program) { idx_ctx ctx; reindex_program(ctx, program); @@ -113,7 +116,8 @@ void reindex_ssa(Program* program) program->allocationID = program->temp_rc.size(); } -void reindex_ssa(Program* program, std::vector& live_out) +void +reindex_ssa(Program* program, std::vector& live_out) { idx_ctx ctx; reindex_program(ctx, program); @@ -122,4 +126,4 @@ void reindex_ssa(Program* program, std::vector& live_out) program->allocationID = program->temp_rc.size(); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 59338b3d042..9a17a816d89 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -34,11 +34,11 @@ #define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) #define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) #define POS_EXP_WINDOW_SIZE 512 -#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) -#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) +#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) +#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ #define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8) -#define POS_EXP_MAX_MOVES 512 +#define POS_EXP_MAX_MOVES 512 namespace aco { @@ -54,7 +54,7 @@ enum MoveResult { * or below a group of instruction that hardware can execute as a clause. 
*/ struct DownwardsCursor { - int source_idx; /* Current instruction to consider for moving */ + int source_idx; /* Current instruction to consider for moving */ int insert_idx_clause; /* First clause instruction */ int insert_idx; /* First instruction *after* the clause */ @@ -66,11 +66,9 @@ struct DownwardsCursor { RegisterDemand total_demand; DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand) - : source_idx(current_idx - 1), - insert_idx_clause(current_idx), - insert_idx(current_idx + 1), - clause_demand(initial_clause_demand) { - } + : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1), + clause_demand(initial_clause_demand) + {} void verify_invariants(const RegisterDemand* register_demand); }; @@ -91,18 +89,16 @@ struct UpwardsCursor { insert_idx = -1; /* to be initialized later */ } - bool has_insert_idx() const { - return insert_idx != -1; - } + bool has_insert_idx() const { return insert_idx != -1; } void verify_invariants(const RegisterDemand* register_demand); }; struct MoveState { RegisterDemand max_registers; - Block *block; - Instruction *current; - RegisterDemand *register_demand; /* demand per instruction */ + Block* block; + Instruction* current; + RegisterDemand* register_demand; /* demand per instruction */ bool improved_rar; std::vector depends_on; @@ -143,19 +139,22 @@ struct sched_ctx { */ template -void move_element(T begin_it, size_t idx, size_t before) { - if (idx < before) { - auto begin = std::next(begin_it, idx); - auto end = std::next(begin_it, before); - std::rotate(begin, begin + 1, end); - } else if (idx > before) { - auto begin = std::next(begin_it, before); - auto end = std::next(begin_it, idx + 1); - std::rotate(begin, end - 1, end); - } +void +move_element(T begin_it, size_t idx, size_t before) +{ + if (idx < before) { + auto begin = std::next(begin_it, idx); + auto end = std::next(begin_it, before); + std::rotate(begin, begin + 1, end); + } else if (idx > before) { + auto begin = std::next(begin_it, before); + auto end = std::next(begin_it, idx + 1); + std::rotate(begin, end - 1, end); + } } -void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) +void +DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) { assert(source_idx < insert_idx_clause); assert(insert_idx_clause < insert_idx); @@ -175,7 +174,8 @@ void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand) #endif } -DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) +DownwardsCursor +MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) { improved_rar = improved_rar_; @@ -202,7 +202,8 @@ DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, b /* If add_to_clause is true, the current clause is extended by moving the * instruction at source_idx in front of the clause. Otherwise, the instruction * is moved past the end of the clause without extending it */ -MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause) +MoveResult +MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -211,7 +212,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause return move_fail_ssa; /* check if one of candidate's operands is killed by depending instruction */ - std::vector& RAR_deps = improved_rar ? (add_to_clause ? 
RAR_dependencies_clause : RAR_dependencies) : depends_on; + std::vector& RAR_deps = + improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on; for (const Operand& op : instr->operands) { if (op.isTemp() && RAR_deps[op.tempId()]) { // FIXME: account for difference in register pressure @@ -274,7 +276,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause return move_success; } -void MoveState::downwards_skip(DownwardsCursor& cursor) +void +MoveState::downwards_skip(DownwardsCursor& cursor) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -292,7 +295,9 @@ void MoveState::downwards_skip(DownwardsCursor& cursor) cursor.verify_invariants(register_demand); } -void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) { +void +UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) +{ #ifndef NDEBUG if (!has_insert_idx()) { return; @@ -308,7 +313,8 @@ void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) { #endif } -UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_) +UpwardsCursor +MoveState::upwards_init(int source_idx, bool improved_rar_) { improved_rar = improved_rar_; @@ -323,7 +329,8 @@ UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_) return UpwardsCursor(source_idx); } -bool MoveState::upwards_check_deps(UpwardsCursor& cursor) +bool +MoveState::upwards_check_deps(UpwardsCursor& cursor) { aco_ptr& instr = block->instructions[cursor.source_idx]; for (const Operand& op : instr->operands) { @@ -333,13 +340,15 @@ bool MoveState::upwards_check_deps(UpwardsCursor& cursor) return true; } -void MoveState::upwards_update_insert_idx(UpwardsCursor& cursor) +void +MoveState::upwards_update_insert_idx(UpwardsCursor& cursor) { cursor.insert_idx = cursor.source_idx; cursor.total_demand = register_demand[cursor.insert_idx]; } -MoveResult MoveState::upwards_move(UpwardsCursor& cursor) +MoveResult +MoveState::upwards_move(UpwardsCursor& cursor) { assert(cursor.has_insert_idx()); @@ -355,13 +364,15 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor) return move_fail_rar; } - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + /* check if register pressure is low enough: the diff is negative if register pressure is + * decreased */ const RegisterDemand candidate_diff = get_live_changes(instr); const RegisterDemand temp = get_temp_registers(instr); if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers)) return move_fail_pressure; const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]); - const RegisterDemand new_demand = register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp; + const RegisterDemand new_demand = + register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp; if (new_demand.exceeds(max_registers)) return move_fail_pressure; @@ -385,7 +396,8 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor) return move_success; } -void MoveState::upwards_skip(UpwardsCursor& cursor) +void +MoveState::upwards_skip(UpwardsCursor& cursor) { if (cursor.has_insert_idx()) { aco_ptr& instr = block->instructions[cursor.source_idx]; @@ -405,30 +417,33 @@ void MoveState::upwards_skip(UpwardsCursor& cursor) cursor.verify_invariants(register_demand); } -bool is_gs_or_done_sendmsg(const Instruction *instr) +bool +is_gs_or_done_sendmsg(const Instruction* instr) { if (instr->opcode == 
aco_opcode::s_sendmsg) { uint16_t imm = instr->sopp().imm; - return (imm & sendmsg_id_mask) == _sendmsg_gs || - (imm & sendmsg_id_mask) == _sendmsg_gs_done; + return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done; } return false; } -bool is_done_sendmsg(const Instruction *instr) +bool +is_done_sendmsg(const Instruction* instr) { if (instr->opcode == aco_opcode::s_sendmsg) return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done; return false; } -memory_sync_info get_sync_info_with_hack(const Instruction* instr) +memory_sync_info +get_sync_info_with_hack(const Instruction* instr) { memory_sync_info sync = get_sync_info(instr); if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) { // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works sync.storage = (storage_class)(sync.storage | storage_buffer); - sync.semantics = (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder); + sync.semantics = + (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder); } return sync; } @@ -451,11 +466,13 @@ struct hazard_query { bool contains_sendmsg; bool uses_exec; memory_event_set mem_events; - unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ + unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */ }; -void init_hazard_query(hazard_query *query) { +void +init_hazard_query(hazard_query* query) +{ query->contains_spill = false; query->contains_sendmsg = false; query->uses_exec = false; @@ -464,7 +481,8 @@ void init_hazard_query(hazard_query *query) { query->aliasing_storage_smem = 0; } -void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync) +void +add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync) { set->has_control_barrier |= is_done_sendmsg(instr); if (instr->opcode == aco_opcode::p_barrier) { @@ -494,7 +512,8 @@ void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_inf } } -void add_to_hazard_query(hazard_query *query, Instruction *instr) +void +add_to_hazard_query(hazard_query* query, Instruction* instr) { if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) query->contains_spill = true; @@ -507,7 +526,8 @@ void add_to_hazard_query(hazard_query *query, Instruction *instr) if (!(sync.semantics & semantic_can_reorder)) { unsigned storage = sync.storage; - /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias + /* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and + // buffer/global memory can alias if (storage & (storage_buffer | storage_image)) storage |= storage_buffer | storage_image; if (instr->isSMEM()) @@ -531,7 +551,8 @@ enum HazardResult { hazard_fail_unreorderable, }; -HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards) +HazardResult +perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) { /* don't schedule discards downwards */ if (!upwards && instr->opcode == aco_opcode::p_exit_early_if) @@ -549,10 +570,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool return hazard_fail_export; /* don't move non-reorderable instructions */ - if (instr->opcode == aco_opcode::s_memtime || - instr->opcode == 
aco_opcode::s_memrealtime || - instr->opcode == aco_opcode::s_setprio || - instr->opcode == aco_opcode::s_getreg_b32) + if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || + instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32) return hazard_fail_unreorderable; memory_event_set instr_set; @@ -560,8 +579,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool memory_sync_info sync = get_sync_info_with_hack(instr); add_memory_event(&instr_set, instr, &sync); - memory_event_set *first = &instr_set; - memory_event_set *second = &query->mem_events; + memory_event_set* first = &instr_set; + memory_event_set* second = &query->mem_events; if (upwards) std::swap(first, second); @@ -571,7 +590,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire) return hazard_fail_barrier; if (((first->access_acquire || first->bar_acquire) && second->bar_classes) || - ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic))) + ((first->access_acquire | first->bar_acquire) & + (second->access_relaxed | second->access_atomic))) return hazard_fail_barrier; /* everything before barrier(release) happens before the atomics/control_barriers after * @@ -580,7 +600,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool if (first->bar_release && (second->has_control_barrier || second->access_atomic)) return hazard_fail_barrier; if ((first->bar_classes && (second->bar_release || second->access_release)) || - ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release))) + ((first->access_relaxed | first->access_atomic) & + (second->bar_release | second->access_release))) return hazard_fail_barrier; /* don't move memory barriers around other memory barriers */ @@ -589,14 +610,15 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool /* Don't move memory accesses to before control barriers. I don't think * this is necessary for the Vulkan memory model, but it might be for GLSL450. */ - unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared; - if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes)) + unsigned control_classes = + storage_buffer | storage_atomic_counter | storage_image | storage_shared; + if (first->has_control_barrier && + ((second->access_atomic | second->access_relaxed) & control_classes)) return hazard_fail_barrier; /* don't move memory loads/stores past potentially aliasing loads/stores */ - unsigned aliasing_storage = instr->isSMEM() ? - query->aliasing_storage_smem : - query->aliasing_storage; + unsigned aliasing_storage = + instr->isSMEM() ? 
query->aliasing_storage_smem : query->aliasing_storage; if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) { unsigned intersect = sync.storage & aliasing_storage; if (intersect & storage_shared) @@ -614,9 +636,9 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool return hazard_success; } -void schedule_SMEM(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = SMEM_WINDOW_SIZE; @@ -634,30 +656,37 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx >= 0); assert(candidate_idx == cursor.source_idx); aco_ptr& candidate = block->instructions[candidate_idx]; /* break if we'd make the previous SMEM instruction stall */ - bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + bool can_stall_prev_smem = + idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) break; /* break when encountering another MEM instruction, logical_start or barriers */ if (candidate->opcode == aco_opcode::p_logical_start) break; - /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves to help create more vmem clauses */ - if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || current->operands[0].size() == 4)) + /* only move VMEM instructions below descriptor loads. 
be more aggressive at higher num_waves + * to help create more vmem clauses */ + if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || + current->operands[0].size() == 4)) break; /* don't move descriptor loads below buffer loads */ - if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && candidate->operands[0].size() == 2) + if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && + candidate->operands[0].size() == 2) break; bool can_move_down = true; HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); - if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || + haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || + haz == hazard_fail_export) can_move_down = false; else if (haz != hazard_success) break; @@ -689,9 +718,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, bool found_dependency = false; /* second, check if we have instructions after current to move up */ - for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; + candidate_idx++) { assert(candidate_idx == up_cursor.source_idx); - assert(candidate_idx < (int) block->instructions.size()); + assert(candidate_idx < (int)block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; if (candidate->opcode == aco_opcode::p_logical_end) @@ -748,9 +778,9 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, ctx.last_SMEM_stall = 10 - ctx.num_waves - k; } -void schedule_VMEM(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = VMEM_WINDOW_SIZE; @@ -767,7 +797,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx == cursor.source_idx); assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; @@ -778,7 +809,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, break; /* break if we'd make the previous SMEM instruction stall */ - bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + bool can_stall_prev_smem = + idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) break; @@ -787,14 +819,15 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, int grab_dist = cursor.insert_idx_clause - candidate_idx; /* We can't easily tell how much this will decrease the def-to-use * distances, so just use how far it will be moved as a heuristic. 
*/ - part_of_clause = grab_dist < clause_max_grab_dist && - should_form_clause(current, candidate.get()); + part_of_clause = + grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get()); } /* if current depends on candidate, add additional dependencies and continue */ bool can_move_down = !is_vmem || part_of_clause; - HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false); + HazardResult haz = + perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -809,7 +842,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, continue; } - Instruction *candidate_ptr = candidate.get(); + Instruction* candidate_ptr = candidate.get(); MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause); if (res == move_fail_ssa || res == move_fail_rar) { add_to_hazard_query(&indep_hq, candidate.get()); @@ -832,9 +865,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, bool found_dependency = false; /* second, check if we have instructions after current to move up */ - for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size; + candidate_idx++) { assert(candidate_idx == up_cursor.source_idx); - assert(candidate_idx < (int) block->instructions.size()); + assert(candidate_idx < (int)block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; bool is_vmem = candidate->isVMEM() || candidate->isFlatLike(); @@ -889,9 +923,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, } } -void schedule_position_export(sched_ctx& ctx, Block* block, - std::vector& register_demand, - Instruction* current, int idx) +void +schedule_position_export(sched_ctx& ctx, Block* block, std::vector& register_demand, + Instruction* current, int idx) { assert(idx != 0); int window_size = POS_EXP_WINDOW_SIZE; @@ -904,7 +938,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block, init_hazard_query(&hq); add_to_hazard_query(&hq, current); - for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size; + candidate_idx--) { assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; @@ -935,7 +970,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block, } } -void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars) +void +schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars) { ctx.last_SMEM_dep_idx = 0; ctx.last_SMEM_stall = INT16_MIN; @@ -950,7 +986,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v unsigned target = current->exp().dest; if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) { ctx.mv.current = current; - schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx); + schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, + idx); } } @@ -975,8 +1012,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v } } - -void schedule_program(Program *program, live& live_vars) +void +schedule_program(Program* program, live& live_vars) { 
/* don't use program->max_reg_demand because that is affected by max_waves_per_simd */ RegisterDemand demand; @@ -991,7 +1028,7 @@ void schedule_program(Program *program, live& live_vars) /* Allowing the scheduler to reduce the number of waves to as low as 5 * improves performance of Thrones of Britannia significantly and doesn't * seem to hurt anything else. */ - //TODO: account for possible uneven num_waves on GFX10+ + // TODO: account for possible uneven num_waves on GFX10+ unsigned wave_fac = program->dev.physical_vgprs / 256; if (program->num_waves <= 5 * wave_fac) ctx.num_waves = program->num_waves; @@ -1008,8 +1045,8 @@ void schedule_program(Program *program, live& live_vars) ctx.num_waves = std::max(ctx.num_waves / wave_fac, 1); assert(ctx.num_waves > 0); - ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), - int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; + ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; for (Block& block : program->blocks) schedule_block(ctx, program, &block, live_vars); @@ -1021,8 +1058,8 @@ void schedule_program(Program *program, live& live_vars) } update_vgpr_sgpr_demand(program, new_demand); - /* if enabled, this code asserts that register_demand is updated correctly */ - #if 0 +/* if enabled, this code asserts that register_demand is updated correctly */ +#if 0 int prev_num_waves = program->num_waves; const RegisterDemand prev_max_demand = program->max_reg_demand; @@ -1042,7 +1079,7 @@ void schedule_program(Program *program, live& live_vars) assert(program->max_reg_demand == prev_max_demand); assert(program->num_waves == prev_num_waves); - #endif +#endif } -} +} // namespace aco diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c0f0471fff9..8996fae8f39 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -25,6 +25,7 @@ #include "aco_builder.h" #include "aco_ir.h" + #include "common/sid.h" #include @@ -44,7 +45,7 @@ namespace aco { namespace { struct remat_info { - Instruction *instr; + Instruction* instr; }; struct spill_ctx { @@ -62,15 +63,16 @@ struct spill_ctx { std::vector> affinities; std::vector is_reloaded; std::map remat; - std::map remat_used; + std::map remat_used; unsigned wave_size; spill_ctx(const RegisterDemand target_pressure_, Program* program_, std::vector> register_demand_) - : target_pressure(target_pressure_), program(program_), - register_demand(std::move(register_demand_)), renames(program->blocks.size()), - spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), - processed(program->blocks.size(), false), wave_size(program->wave_size) {} + : target_pressure(target_pressure_), program(program_), + register_demand(std::move(register_demand_)), renames(program->blocks.size()), + spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), + processed(program->blocks.size(), false), wave_size(program->wave_size) + {} void add_affinity(uint32_t first, uint32_t second) { @@ -93,7 +95,9 @@ struct spill_ctx { affinities[found_second].push_back(first); } else if (found_first != found_second) { /* merge second into first */ - affinities[found_first].insert(affinities[found_first].end(), affinities[found_second].begin(), affinities[found_second].end()); + affinities[found_first].insert(affinities[found_first].end(), + affinities[found_second].begin(), + 
affinities[found_second].end()); affinities.erase(std::next(affinities.begin(), found_second)); } else { assert(found_first == found_second); @@ -120,7 +124,8 @@ struct spill_ctx { uint32_t next_spill_id = 0; }; -int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) +int32_t +get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) { if (idx_a == -1) @@ -146,21 +151,23 @@ int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) return idx_a; } -void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) +void +next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) { Block* block = &ctx.program->blocks[block_idx]; std::map> next_uses = ctx.next_use_distances_end[block_idx]; - /* to compute the next use distance at the beginning of the block, we have to add the block's size */ - for (std::map>::iterator it = next_uses.begin(); it != next_uses.end(); ++it) + /* to compute the next use distance at the beginning of the block, we have to add the block's + * size */ + for (std::map>::iterator it = next_uses.begin(); + it != next_uses.end(); ++it) it->second.second = it->second.second + block->instructions.size(); int idx = block->instructions.size() - 1; while (idx >= 0) { aco_ptr& instr = block->instructions[idx]; - if (instr->opcode == aco_opcode::p_linear_phi || - instr->opcode == aco_opcode::p_phi) + if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi) break; for (const Definition& def : instr->definitions) { @@ -192,13 +199,14 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& } auto it = next_uses.find(instr->definitions[0].getTemp()); - std::pair distance = it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second; + std::pair distance = + it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second; for (unsigned i = 0; i < instr->operands.size(); i++) { - unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? - block->logical_preds[i] : - block->linear_preds[i]; + unsigned pred_idx = + instr->opcode == aco_opcode::p_phi ? 
block->logical_preds[i] : block->linear_preds[i]; if (instr->operands[i].isTemp()) { - if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() || + if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == + ctx.next_use_distances_end[pred_idx].end() || ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance) worklist.insert(pred_idx); ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance; @@ -217,19 +225,22 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& for (unsigned pred_idx : preds) { if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) distance += 0xFFFF; - if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) { - dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear()); + if (ctx.next_use_distances_end[pred_idx].find(temp) != + ctx.next_use_distances_end[pred_idx].end()) { + dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, + temp.is_linear()); distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); } - if (ctx.next_use_distances_end[pred_idx][temp] != std::pair{dom, distance}) + if (ctx.next_use_distances_end[pred_idx][temp] != + std::pair{dom, distance}) worklist.insert(pred_idx); ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; } } - } -void compute_global_next_uses(spill_ctx& ctx) +void +compute_global_next_uses(spill_ctx& ctx) { ctx.next_use_distances_start.resize(ctx.program->blocks.size()); ctx.next_use_distances_end.resize(ctx.program->blocks.size()); @@ -245,12 +256,15 @@ void compute_global_next_uses(spill_ctx& ctx) } } -bool should_rematerialize(aco_ptr& instr) +bool +should_rematerialize(aco_ptr& instr) { /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ - if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK) + if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && + instr->format != Format::PSEUDO && instr->format != Format::SOPK) return false; - /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector/p_parallelcopy */ + /* TODO: pseudo-instruction rematerialization is only supported for + * p_create_vector/p_parallelcopy */ if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector && instr->opcode != aco_opcode::p_parallelcopy) return false; @@ -270,24 +284,32 @@ bool should_rematerialize(aco_ptr& instr) return true; } -aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) +aco_ptr +do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) { std::map::iterator remat = ctx.remat.find(tmp); if (remat != ctx.remat.end()) { - Instruction *instr = remat->second.instr; - assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && "unsupported"); - assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_parallelcopy) && "unsupported"); + Instruction* instr = remat->second.instr; + assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && + "unsupported"); + assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || + instr->opcode == aco_opcode::p_parallelcopy) && + "unsupported"); 
assert(instr->definitions.size() == 1 && "unsupported"); aco_ptr res; if (instr->isVOP1()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isSOP1()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isPseudo()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->isSOPK()) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction( + instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); res->sopk().imm = instr->sopk().imm; } for (unsigned i = 0; i < instr->operands.size(); i++) { @@ -301,7 +323,8 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t res->definitions[0] = Definition(new_name); return res; } else { - aco_ptr reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; + aco_ptr reload{ + create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; reload->operands[0] = Operand(spill_id); reload->definitions[0] = Definition(new_name); ctx.is_reloaded[spill_id] = true; @@ -309,7 +332,8 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t } } -void get_rematerialize_info(spill_ctx& ctx) +void +get_rematerialize_info(spill_ctx& ctx) { for (Block& block : ctx.program->blocks) { bool logical = false; @@ -330,12 +354,14 @@ void get_rematerialize_info(spill_ctx& ctx) } } -std::vector> local_next_uses(spill_ctx& ctx, Block* block) +std::vector> +local_next_uses(spill_ctx& ctx, Block* block) { std::vector> local_next_uses(block->instructions.size()); std::map next_uses; - for (std::pair> pair : ctx.next_use_distances_end[block->index]) + for (std::pair> pair : + ctx.next_use_distances_end[block->index]) next_uses[pair.first] = pair.second.second + block->instructions.size(); for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { @@ -362,7 +388,8 @@ std::vector> local_next_uses(spill_ctx& ctx, Block* blo return local_next_uses; } -RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) +RegisterDemand +get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) { if (idx == 0) { RegisterDemand demand = ctx.register_demand[block_idx][idx]; @@ -374,7 +401,8 @@ RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned id } } -RegisterDemand get_live_in_demand(spill_ctx& ctx, unsigned block_idx) +RegisterDemand +get_live_in_demand(spill_ctx& ctx, unsigned block_idx) { unsigned idx = 0; RegisterDemand reg_pressure = RegisterDemand(); @@ -398,12 +426,14 @@ RegisterDemand get_live_in_demand(spill_ctx& ctx, unsigned block_idx) /* Consider register pressure from linear predecessors. This can affect * reg_pressure if the branch instructions define sgprs. 
*/ for (unsigned pred : block.linear_preds) - reg_pressure.sgpr = std::max(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr); + reg_pressure.sgpr = + std::max(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr); return reg_pressure; } -RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) +RegisterDemand +init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) { RegisterDemand spilled_registers; @@ -461,7 +491,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id Temp to_spill; for (std::pair> pair : next_use_distances) { if (pair.first.type() == type && - (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && + (pair.second.first >= loop_end || + (ctx.remat.count(pair.first) && type == RegType::sgpr)) && pair.second.second > distance && ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { to_spill = pair.first; @@ -478,7 +509,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } uint32_t spill_id; - if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + if (ctx.spills_exit[block_idx - 1].find(to_spill) == + ctx.spills_exit[block_idx - 1].end()) { spill_id = ctx.allocate_spill_id(to_spill.regClass()); } else { spill_id = ctx.spills_exit[block_idx - 1][to_spill]; @@ -502,8 +534,7 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; for (std::pair> pair : next_use_distances) { - if (pair.first.type() == type && - pair.second.second > distance && + if (pair.first.type() == type && pair.second.second > distance && ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { to_spill = pair.first; distance = pair.second.second; @@ -542,7 +573,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } } - /* if register demand is still too high, we just keep all spilled live vars and process the block */ + /* if register demand is still too high, we just keep all spilled live vars + * and process the block */ if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { @@ -553,7 +585,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } } } - if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) { + if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && + block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { if (pair.first.type() == RegType::vgpr && @@ -572,17 +605,21 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id /* keep variables spilled on all incoming paths */ for (std::pair> pair : next_use_distances) { - std::vector& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; - /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it. - * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors. - * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. 
*/ - /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */ + std::vector& preds = + pair.first.is_linear() ? block->linear_preds : block->logical_preds; + /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload + * it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other + * predecessors. The idea is that it's better in practice to rematerialize redundantly than to + * create lots of phis. */ + /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db + * doesn't seem to exercise this path much) */ bool remat = ctx.remat.count(pair.first); bool spill = !remat; uint32_t spill_id = 0; for (unsigned pred_idx : preds) { /* variable is not even live at the predecessor: probably from a phi */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) { + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) { spill = false; break; } @@ -591,7 +628,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id spill = false; } else { partial_spills.insert(pair.first); - /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */ + /* it might be that on one incoming path, the variable has a different spill_id, but + * add_couple_code() will take care of that. */ spill_id = ctx.spills_exit[pred_idx][pair.first]; if (remat) spill = true; @@ -611,7 +649,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id if (!phi->definitions[0].isTemp()) continue; - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? 
block->logical_preds : block->linear_preds; bool spill = true; for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -621,13 +660,15 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id continue; } - if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end()) + if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == + ctx.spills_exit[preds[i]].end()) spill = false; else partial_spills.insert(phi->definitions[0].getTemp()); } if (spill) { - ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass()); + ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = + ctx.allocate_spill_id(phi->definitions[0].regClass()); partial_spills.erase(phi->definitions[0].getTemp()); spilled_registers += phi->definitions[0].getTemp(); } @@ -664,7 +705,8 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id return spilled_registers; } -void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) +void +add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) { /* no coupling code necessary */ if (block->linear_preds.size() == 0) @@ -672,14 +714,16 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) std::vector> instructions; /* branch block: TODO take other branch into consideration */ - if (block->linear_preds.size() == 1 && !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { + if (block->linear_preds.size() == 1 && + !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { assert(ctx.processed[block->linear_preds[0]]); assert(ctx.register_demand[block_idx].size() == block->instructions.size()); std::vector reg_demand; unsigned insert_idx = 0; RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); - for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + for (std::pair> live : + ctx.next_use_distances_start[block_idx]) { const unsigned pred_idx = block->linear_preds[0]; if (!live.first.is_linear()) @@ -698,7 +742,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* variable is spilled at predecessor and live at current block: create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = + do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); instructions.emplace_back(std::move(reload)); reg_demand.push_back(demand_before); ctx.renames[block_idx][live.first] = new_name; @@ -713,7 +758,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } while (instructions.back()->opcode != aco_opcode::p_logical_start); unsigned pred_idx = block->logical_preds[0]; - for (std::pair> live : ctx.next_use_distances_start[block_idx]) { + for (std::pair> live : + ctx.next_use_distances_start[block_idx]) { if (live.first.is_linear()) continue; /* still spilled */ @@ -728,9 +774,11 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) continue; } - /* variable is spilled at predecessor and live at current block: create reload instruction */ + /* variable is spilled at predecessor and live at current block: + * create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = + 
do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); instructions.emplace_back(std::move(reload)); reg_demand.emplace_back(reg_demand.back()); ctx.renames[block_idx][live.first] = new_name; @@ -739,12 +787,15 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* combine new reload instructions with original block */ if (!instructions.empty()) { - reg_demand.insert(reg_demand.end(), std::next(ctx.register_demand[block->index].begin(), insert_idx), + reg_demand.insert(reg_demand.end(), + std::next(ctx.register_demand[block->index].begin(), insert_idx), ctx.register_demand[block->index].end()); ctx.register_demand[block_idx] = std::move(reg_demand); instructions.insert(instructions.end(), - std::move_iterator>::iterator>(std::next(block->instructions.begin(), insert_idx)), - std::move_iterator>::iterator>(block->instructions.end())); + std::move_iterator>::iterator>( + std::next(block->instructions.begin(), insert_idx)), + std::move_iterator>::iterator>( + block->instructions.end())); block->instructions = std::move(instructions); } return; @@ -761,12 +812,14 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* if the phi is not spilled, add to instructions */ if (!phi->definitions[0].isTemp() || - ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) { + ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == + ctx.spills_entry[block_idx].end()) { instructions.emplace_back(std::move(phi)); continue; } - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()]; for (unsigned i = 0; i < phi->operands.size(); i++) { @@ -807,7 +860,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.add_interference(spill_id, pair.second); ctx.add_affinity(def_spill_id, spill_id); - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = spill_op; spill->operands[1] = Operand(spill_id); Block& pred = ctx.program->blocks[pred_idx]; @@ -815,7 +869,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (phi->opcode == aco_opcode::p_phi && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); pred.instructions.insert(it, std::move(spill)); if (spill_op.isTemp()) @@ -829,7 +884,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate all (other) spilled variables for which to spill at the predecessor */ // TODO: would be better to have them sorted: first vgprs and first with longest distance for (std::pair pair : ctx.spills_entry[block_idx]) { - std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + std::vector preds = + pair.first.is_linear() ? 
block->linear_preds : block->logical_preds; for (unsigned pred_idx : preds) { /* variable is already spilled at predecessor */ @@ -841,7 +897,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) continue; /* add interferences between spilled variable and predecessors exit spills */ @@ -860,7 +917,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.renames[pred_idx].erase(rename_it); } - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = Operand(var); spill->operands[1] = Operand(pair.second); Block& pred = ctx.program->blocks[pred_idx]; @@ -868,7 +926,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (pair.first.type() == RegType::vgpr && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); pred.instructions.insert(it, std::move(spill)); ctx.spills_exit[pred.index][pair.first] = pair.second; @@ -878,17 +937,22 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate phis for which operands to reload */ for (aco_ptr& phi : instructions) { assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi); - assert(!phi->definitions[0].isTemp() || ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()); + assert(!phi->definitions[0].isTemp() || + ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == + ctx.spills_entry[block_idx].end()); - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? 
block->logical_preds : block->linear_preds; for (unsigned i = 0; i < phi->operands.size(); i++) { if (!phi->operands[i].isTemp()) continue; unsigned pred_idx = preds[i]; /* if the operand was reloaded, rename */ - if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) { - std::map::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp()); + if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == + ctx.spills_exit[pred_idx].end()) { + std::map::iterator it = + ctx.renames[pred_idx].find(phi->operands[i].getTemp()); if (it != ctx.renames[pred_idx].end()) phi->operands[i].setTemp(it->second); /* prevent the definining instruction from being DCE'd if it could be rematerialized */ @@ -906,9 +970,11 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (phi->opcode == aco_opcode::p_phi && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); - aco_ptr reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); + aco_ptr reload = + do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); /* reload spilled exec mask directly to exec */ if (!phi->definitions[0].isTemp()) { @@ -927,16 +993,19 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate live variables for which to reload */ // TODO: reload at current block if variable is spilled on all predecessors - for (std::pair> pair : ctx.next_use_distances_start[block_idx]) { + for (std::pair> pair : + ctx.next_use_distances_start[block_idx]) { /* skip spilled variables */ if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) continue; - std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; + std::vector preds = + pair.first.is_linear() ? block->linear_preds : block->logical_preds; /* variable is dead at predecessor, it must be from a phi */ bool is_dead = false; for (unsigned pred_idx : preds) { - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == + ctx.next_use_distances_end[pred_idx].end()) is_dead = true; } if (is_dead) @@ -953,10 +1022,12 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) do { assert(idx != 0); idx--; - } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + } while (pair.first.type() == RegType::vgpr && + pred.instructions[idx]->opcode != aco_opcode::p_logical_end); std::vector>::iterator it = std::next(pred.instructions.begin(), idx); - aco_ptr reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); + aco_ptr reload = + do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); pred.instructions.insert(it, std::move(reload)); ctx.spills_exit[pred.index].erase(pair.first); @@ -986,7 +1057,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (!is_same) { /* the variable was renamed differently in the predecessors: we have to create a phi */ aco_opcode opcode = pair.first.is_linear() ? 
aco_opcode::p_linear_phi : aco_opcode::p_phi; - aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + aco_ptr phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; rename = ctx.program->allocateTmp(pair.first.regClass()); for (unsigned i = 0; i < phi->operands.size(); i++) { Temp tmp; @@ -1020,18 +1092,22 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (!ctx.processed[block_idx]) { assert(!(block->kind & block_kind_loop_header)); RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx); - ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); - ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), demand_before); + ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), + ctx.register_demand[block->index].begin() + idx); + ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), + instructions.size(), demand_before); } std::vector>::iterator start = std::next(block->instructions.begin(), idx); - instructions.insert(instructions.end(), std::move_iterator>::iterator>(start), - std::move_iterator>::iterator>(block->instructions.end())); + instructions.insert( + instructions.end(), std::move_iterator>::iterator>(start), + std::move_iterator>::iterator>(block->instructions.end())); block->instructions = std::move(instructions); } -void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, - std::map ¤t_spills, RegisterDemand spilled_registers) +void +process_block(spill_ctx& ctx, unsigned block_idx, Block* block, + std::map& current_spills, RegisterDemand spilled_registers) { assert(!ctx.processed[block_idx]); @@ -1099,7 +1175,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, if (((pair.second > distance && can_rematerialize == do_rematerialize) || (can_rematerialize && !do_rematerialize && pair.second > idx)) && current_spills.find(pair.first) == current_spills.end() && - ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + ctx.spills_exit[block_idx].find(pair.first) == + ctx.spills_exit[block_idx].end()) { to_spill = pair.first; distance = pair.second; do_rematerialize = can_rematerialize; @@ -1124,7 +1201,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, } /* add spill to new instructions */ - aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + aco_ptr spill{ + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; spill->operands[0] = Operand(to_spill); spill->operands[1] = Operand(spill_id); instructions.emplace_back(std::move(spill)); @@ -1133,7 +1211,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, /* add reloads and instruction to new instructions */ for (std::pair> pair : reloads) { - aco_ptr reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second); + aco_ptr reload = + do_reload(ctx, pair.second.first, pair.first, pair.second.second); instructions.emplace_back(std::move(reload)); } instructions.emplace_back(std::move(instr)); @@ -1144,7 +1223,8 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); } -void spill_block(spill_ctx& ctx, unsigned block_idx) +void +spill_block(spill_ctx& ctx, unsigned block_idx) { Block* block = &ctx.program->blocks[block_idx]; @@ 
-1152,7 +1232,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx); /* add interferences for spilled variables */ - for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); ++it) { + for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); + ++it) { for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2) ctx.add_interference(it->second, it2->second); } @@ -1167,8 +1248,7 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) /* check conditions to process this block */ bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || - !ctx.renames[block_idx].empty() || - ctx.remat_used.size(); + !ctx.renames[block_idx].empty() || ctx.remat_used.size(); for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) { if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx) @@ -1183,7 +1263,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) ctx.processed[block_idx] = true; /* check if the next block leaves the current loop */ - if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) + if (block->loop_nest_depth == 0 || + ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) return; Block* loop_header = ctx.loop_header.top(); @@ -1206,7 +1287,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) aco_ptr& phi = *instr_it; if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; - /* no need to rename the loop header phis once again. this happened in add_coupling_code() */ + /* no need to rename the loop header phis once again. this happened in + * add_coupling_code() */ if (idx == loop_header->index) { instr_it++; continue; @@ -1240,7 +1322,7 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) op.setTemp(rename.second); /* we can stop with this block as soon as the variable is spilled */ if (instr->opcode == aco_opcode::p_spill) - renamed = true; + renamed = true; } } instr_it++; @@ -1252,9 +1334,10 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) ctx.loop_header.pop(); } -Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, - std::vector>& instructions, - unsigned offset, bool is_top_level) +Temp +load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, + std::vector>& instructions, unsigned offset, + bool is_top_level) { Builder bld(ctx.program); if (is_top_level) { @@ -1269,19 +1352,21 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, Temp private_segment_buffer = ctx.program->private_segment_buffer; if (ctx.program->stage != compute_cs) - private_segment_buffer = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u)); + private_segment_buffer = + bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(0u)); if (offset) - scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), scratch_offset, Operand(offset)); + scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), + scratch_offset, Operand(offset)); - uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2); + uint32_t rsrc_conf = + S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 
3 : 2); if (ctx.program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else if (ctx.program->chip_class <= GFX7) { + /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } @@ -1289,14 +1374,13 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, if (ctx.program->chip_class <= GFX8) rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), - private_segment_buffer, Operand(-1u), + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(-1u), Operand(rsrc_conf)); } -void add_interferences(spill_ctx& ctx, std::vector& is_assigned, - std::vector& slots, std::vector& slots_used, - unsigned id) +void +add_interferences(spill_ctx& ctx, std::vector& is_assigned, std::vector& slots, + std::vector& slots_used, unsigned id) { for (unsigned other : ctx.interferences[id].second) { if (!is_assigned[other]) @@ -1308,8 +1392,9 @@ void add_interferences(spill_ctx& ctx, std::vector& is_assigned, } } -unsigned find_available_slot(std::vector& used, unsigned wave_size, - unsigned size, bool is_sgpr, unsigned *num_slots) +unsigned +find_available_slot(std::vector& used, unsigned wave_size, unsigned size, bool is_sgpr, + unsigned* num_slots) { unsigned wave_size_minus_one = wave_size - 1; unsigned slot = 0; @@ -1341,10 +1426,9 @@ unsigned find_available_slot(std::vector& used, unsigned wave_size, } } -void assign_spill_slots_helper(spill_ctx& ctx, RegType type, - std::vector& is_assigned, - std::vector& slots, - unsigned *num_slots) +void +assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector& is_assigned, + std::vector& slots, unsigned* num_slots) { std::vector slots_used(*num_slots); @@ -1360,9 +1444,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, add_interferences(ctx, is_assigned, slots, slots_used, id); } - unsigned slot = find_available_slot(slots_used, ctx.wave_size, - ctx.interferences[vec[0]].first.size(), - type == RegType::sgpr, num_slots); + unsigned slot = + find_available_slot(slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(), + type == RegType::sgpr, num_slots); for (unsigned id : vec) { assert(!is_assigned[id]); @@ -1381,9 +1465,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, add_interferences(ctx, is_assigned, slots, slots_used, id); - unsigned slot = find_available_slot(slots_used, ctx.wave_size, - ctx.interferences[id].first.size(), - type == RegType::sgpr, num_slots); + unsigned slot = + find_available_slot(slots_used, ctx.wave_size, ctx.interferences[id].first.size(), + type == RegType::sgpr, num_slots); slots[id] = slot; is_assigned[id] = true; @@ -1392,7 +1476,9 @@ void assign_spill_slots_helper(spill_ctx& ctx, RegType type, *num_slots = slots_used.size(); } -void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { +void +assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) +{ std::vector slots(ctx.interferences.size()); std::vector is_assigned(ctx.interferences.size()); @@ -1426,7 +1512,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (!is_assigned[vec[i]]) continue; 
assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]); - assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type()); + assert(ctx.interferences[vec[i]].first.type() == + ctx.interferences[vec[j]].first.type()); assert(slots[vec[i]] == slots[vec[j]]); } } @@ -1451,7 +1538,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } if (end_vgprs > 0) { - aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; + aco_ptr destr{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; int k = 0; for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { if (reload_in_loop[i]) @@ -1505,17 +1593,25 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { /* spill vgpr */ ctx.program->config->spilled_vgprs += (*it)->operands[0].size(); uint32_t spill_slot = slots[spill_id]; - bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; - unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + bool add_offset_to_sgpr = + ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + + vgpr_spill_slots * 4 > + 4096; + unsigned base_offset = + add_offset_to_sgpr + ? 0 + : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; /* check if the scratch resource descriptor already exists */ if (scratch_rsrc == Temp()) { - unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; - scratch_rsrc = load_scratch_resource(ctx, scratch_offset, - last_top_level_block_idx == block.index ? - instructions : ctx.program->blocks[last_top_level_block_idx].instructions, - offset, - last_top_level_block_idx == block.index); + unsigned offset = + add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource( + ctx, scratch_offset, + last_top_level_block_idx == block.index + ? 
instructions + : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, last_top_level_block_idx == block.index); } unsigned offset = base_offset + spill_slot * 4; @@ -1524,17 +1620,21 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { Temp temp = (*it)->operands[0].getTemp(); assert(temp.type() == RegType::vgpr && !temp.is_linear()); if (temp.size() > 1) { - Instruction* split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; + Instruction* split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; split->operands[0] = Operand(temp); for (unsigned i = 0; i < temp.size(); i++) split->definitions[i] = bld.def(v1); bld.insert(split); for (unsigned i = 0; i < temp.size(); i++) { - Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + Instruction* instr = + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, + split->definitions[i].getTemp(), offset + i * 4, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { - Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + Instruction* instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, + temp, offset, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { @@ -1546,7 +1646,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear()); vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -1555,13 +1656,15 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& block_instrs = ctx.program->blocks[last_top_level_block_idx].instructions; + std::vector>& block_instrs = + ctx.program->blocks[last_top_level_block_idx].instructions; block_instrs.insert(std::prev(block_instrs.end()), std::move(create)); } } /* spill sgpr: just add the vgpr temp to operands */ - Pseudo_instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); + Pseudo_instruction* spill = + create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); spill->operands[1] = Operand(spill_slot % ctx.wave_size); spill->operands[2] = (*it)->operands[0]; @@ -1577,34 +1680,46 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) { /* reload vgpr */ uint32_t spill_slot = slots[spill_id]; - bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; - unsigned base_offset = add_offset_to_sgpr ? 
0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; + bool add_offset_to_sgpr = + ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + + vgpr_spill_slots * 4 > + 4096; + unsigned base_offset = + add_offset_to_sgpr + ? 0 + : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; /* check if the scratch resource descriptor already exists */ if (scratch_rsrc == Temp()) { - unsigned offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; - scratch_rsrc = load_scratch_resource(ctx, scratch_offset, - last_top_level_block_idx == block.index ? - instructions : ctx.program->blocks[last_top_level_block_idx].instructions, - offset, - last_top_level_block_idx == block.index); + unsigned offset = + add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0; + scratch_rsrc = load_scratch_resource( + ctx, scratch_offset, + last_top_level_block_idx == block.index + ? instructions + : ctx.program->blocks[last_top_level_block_idx].instructions, + offset, last_top_level_block_idx == block.index); } unsigned offset = base_offset + spill_slot * 4; aco_opcode opcode = aco_opcode::buffer_load_dword; Definition def = (*it)->definitions[0]; if (def.size() > 1) { - Instruction* vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; + Instruction* vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; vec->definitions[0] = def; for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + Instruction* instr = + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), + scratch_offset, offset + i * 4, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } bld.insert(vec); } else { - Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + Instruction* instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), + scratch_offset, offset, false, true); instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { @@ -1615,7 +1730,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear()); vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; - aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + aco_ptr create{create_instruction( + aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ if (last_top_level_block_idx == block.index) { @@ -1624,13 +1740,15 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else { assert(last_top_level_block_idx < block.index); /* insert before the branch at last top level block */ - std::vector>& block_instrs = ctx.program->blocks[last_top_level_block_idx].instructions; + std::vector>& block_instrs = + ctx.program->blocks[last_top_level_block_idx].instructions; block_instrs.insert(std::prev(block_instrs.end()), std::move(create)); } } /* reload sgpr: just add the vgpr temp to operands */ - Pseudo_instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); + Pseudo_instruction* reload = create_instruction( + 
aco_opcode::p_reload, Format::PSEUDO, 2, 1); reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); reload->operands[1] = Operand(spill_slot % ctx.wave_size); reload->definitions[0] = (*it)->definitions[0]; @@ -1639,13 +1757,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { instructions.emplace_back(std::move(*it)); } - } block.instructions = std::move(instructions); } /* update required scratch memory */ - ctx.program->config->scratch_bytes_per_wave += align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024); + ctx.program->config->scratch_bytes_per_wave += + align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024); /* SSA elimination inserts copies for logical phis right before p_logical_end * So if a linear vgpr is used between that p_logical_end and the branch, @@ -1686,7 +1804,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (!vgprs.size()) continue; - aco_ptr destr{create_instruction(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; + aco_ptr destr{create_instruction( + aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; int k = 0; for (Temp tmp : vgprs) { destr->operands[k++] = Operand(tmp); @@ -1701,8 +1820,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } /* end namespace */ - -void spill(Program* program, live& live_vars) +void +spill(Program* program, live& live_vars) { program->config->spilled_vgprs = 0; program->config->spilled_sgprs = 0; @@ -1758,5 +1877,4 @@ void spill(Program* program, live& live_vars) assert(program->num_waves > 0); } -} - +} // namespace aco diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index dcb8b162b76..266af1e4893 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -37,7 +37,8 @@ struct phi_info_item { }; struct ssa_elimination_ctx { - /* The outer vectors should be indexed by block index. The inner vectors store phi information for each block. */ + /* The outer vectors should be indexed by block index. The inner vectors store phi information + * for each block. */ std::vector> logical_phi_info; std::vector> linear_phi_info; std::vector empty_blocks; @@ -45,14 +46,14 @@ struct ssa_elimination_ctx { Program* program; ssa_elimination_ctx(Program* program_) - : logical_phi_info(program_->blocks.size()) - , linear_phi_info(program_->blocks.size()) - , empty_blocks(program_->blocks.size(), true) - , blocks_incoming_exec_used(program_->blocks.size(), true) - , program(program_) {} + : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()), + empty_blocks(program_->blocks.size(), true), + blocks_incoming_exec_used(program_->blocks.size(), true), program(program_) + {} }; -void collect_phi_info(ssa_elimination_ctx& ctx) +void +collect_phi_info(ssa_elimination_ctx& ctx) { for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { @@ -67,9 +68,11 @@ void collect_phi_info(ssa_elimination_ctx& ctx) assert(phi->definitions[0].size() == phi->operands[i].size()); - std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; + std::vector& preds = + phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; uint32_t pred_idx = preds[i]; - auto& info_vec = phi->opcode == aco_opcode::p_phi ? 
ctx.logical_phi_info[pred_idx] : ctx.linear_phi_info[pred_idx]; + auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx] + : ctx.linear_phi_info[pred_idx]; info_vec.push_back({phi->definitions[0], phi->operands[i]}); ctx.empty_blocks[pred_idx] = false; } @@ -77,11 +80,12 @@ void collect_phi_info(ssa_elimination_ctx& ctx) } } -void insert_parallelcopies(ssa_elimination_ctx& ctx) +void +insert_parallelcopies(ssa_elimination_ctx& ctx) { /* insert the parallelcopies from logical phis before p_logical_end */ for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) { - auto &logical_phi_info = ctx.logical_phi_info[block_idx]; + auto& logical_phi_info = ctx.logical_phi_info[block_idx]; if (logical_phi_info.empty()) continue; @@ -93,10 +97,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) } std::vector>::iterator it = std::next(block.instructions.begin(), idx); - aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, logical_phi_info.size(), logical_phi_info.size())}; + aco_ptr pc{ + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, + logical_phi_info.size(), logical_phi_info.size())}; unsigned i = 0; - for (auto& phi_info : logical_phi_info) - { + for (auto& phi_info : logical_phi_info) { pc->definitions[i] = phi_info.def; pc->operands[i] = phi_info.op; i++; @@ -108,7 +113,7 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) /* insert parallelcopies for the linear phis at the end of blocks just before the branch */ for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) { - auto &linear_phi_info = ctx.linear_phi_info[block_idx]; + auto& linear_phi_info = ctx.linear_phi_info[block_idx]; if (linear_phi_info.empty()) continue; @@ -116,10 +121,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) std::vector>::iterator it = block.instructions.end(); --it; assert((*it)->isBranch()); - aco_ptr pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, linear_phi_info.size(), linear_phi_info.size())}; + aco_ptr pc{ + create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, + linear_phi_info.size(), linear_phi_info.size())}; unsigned i = 0; - for (auto& phi_info : linear_phi_info) - { + for (auto& phi_info : linear_phi_info) { pc->definitions[i] = phi_info.def; pc->operands[i] = phi_info.op; i++; @@ -130,38 +136,38 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx) } } -bool is_empty_block(Block* block, bool ignore_exec_writes) +bool +is_empty_block(Block* block, bool ignore_exec_writes) { /* check if this block is empty and the exec mask is not needed */ for (aco_ptr& instr : block->instructions) { switch (instr->opcode) { - case aco_opcode::p_linear_phi: - case aco_opcode::p_phi: - case aco_opcode::p_logical_start: - case aco_opcode::p_logical_end: - case aco_opcode::p_branch: + case aco_opcode::p_linear_phi: + case aco_opcode::p_phi: + case aco_opcode::p_logical_start: + case aco_opcode::p_logical_end: + case aco_opcode::p_branch: break; + case aco_opcode::p_parallelcopy: + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ignore_exec_writes && instr->definitions[i].physReg() == exec) + continue; + if (instr->definitions[i].physReg() != instr->operands[i].physReg()) + return false; + } + break; + case aco_opcode::s_andn2_b64: + case aco_opcode::s_andn2_b32: + if (ignore_exec_writes && instr->definitions[0].physReg() == exec) break; - case aco_opcode::p_parallelcopy: - for (unsigned i = 0; i < instr->definitions.size(); i++) { - if 
(ignore_exec_writes && instr->definitions[i].physReg() == exec) - continue; - if (instr->definitions[i].physReg() != instr->operands[i].physReg()) - return false; - } - break; - case aco_opcode::s_andn2_b64: - case aco_opcode::s_andn2_b32: - if (ignore_exec_writes && instr->definitions[0].physReg() == exec) - break; - return false; - default: - return false; + return false; + default: return false; } } return true; } -void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) { /* check if the successor is another merge block which restores exec */ // TODO: divergent loops also restore exec @@ -179,7 +185,8 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) block->instructions.emplace_back(std::move(branch)); } -void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) { assert(block->linear_succs.size() == 2); /* only remove this block if the successor got removed as well */ @@ -193,7 +200,7 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) unsigned succ_idx = block->linear_succs[0]; assert(block->linear_preds.size() == 2); for (unsigned i = 0; i < 2; i++) { - Block *pred = &ctx.program->blocks[block->linear_preds[i]]; + Block* pred = &ctx.program->blocks[block->linear_preds[i]]; pred->linear_succs[0] = succ_idx; ctx.program->blocks[succ_idx].linear_preds[i] = pred->index; @@ -208,7 +215,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) block->linear_succs.clear(); } -void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) +void +try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) { if (!is_empty_block(block, false)) return; @@ -277,7 +285,8 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) block->linear_succs.clear(); } -bool instr_writes_exec(Instruction* instr) +bool +instr_writes_exec(Instruction* instr) { for (Definition& def : instr->definitions) if (def.physReg() == exec || def.physReg() == exec_hi) @@ -286,7 +295,8 @@ bool instr_writes_exec(Instruction* instr) return false; } -void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) +void +eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) { /* Check if any successor needs the outgoing exec mask from the current block. */ @@ -309,8 +319,9 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo exec_write_used = false; else /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */ - exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(), - [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); + exec_write_used = + std::any_of(block.linear_succs.begin(), block.linear_succs.end(), + [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); } /* Go through all instructions and eliminate useless exec writes. */ @@ -318,7 +329,8 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo for (int i = block.instructions.size() - 1; i >= 0; --i) { aco_ptr& instr = block.instructions[i]; - /* We already take information from phis into account before the loop, so let's just break on phis. */ + /* We already take information from phis into account before the loop, so let's just break on + * phis. 
*/ if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi) break; @@ -341,16 +353,15 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo } /* Remember if the current block needs an incoming exec mask from its predecessors. */ - ctx.blocks_incoming_exec_used[block.index] = exec_write_used; /* Cleanup: remove deleted instructions from the vector. */ - auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr); block.instructions.resize(new_end - block.instructions.begin()); } -void jump_threading(ssa_elimination_ctx& ctx) +void +jump_threading(ssa_elimination_ctx& ctx) { for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { Block* block = &ctx.program->blocks[i]; @@ -367,8 +378,7 @@ void jump_threading(ssa_elimination_ctx& ctx) if (block->linear_succs.size() > 1) continue; - if (block->kind & block_kind_merge || - block->kind & block_kind_loop_exit) + if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit) try_remove_merge_block(ctx, block); if (block->linear_preds.size() == 1) @@ -378,8 +388,8 @@ void jump_threading(ssa_elimination_ctx& ctx) } /* end namespace */ - -void ssa_elimination(Program* program) +void +ssa_elimination(Program* program) { ssa_elimination_ctx ctx(program); @@ -391,6 +401,5 @@ void ssa_elimination(Program* program) /* insert parallelcopies from SSA elimination */ insert_parallelcopies(ctx); - -} } +} // namespace aco diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index a8652de8f56..ce114e3f879 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/crc32.h" #include @@ -33,7 +34,8 @@ namespace aco { /* sgpr_presched/vgpr_presched */ -void collect_presched_stats(Program *program) +void +collect_presched_stats(Program* program) { RegisterDemand presched_demand; for (Block& block : program->blocks) @@ -56,9 +58,9 @@ public: resource_count, }; - BlockCycleEstimator(Program *program_) : program(program_) {} + BlockCycleEstimator(Program* program_) : program(program_) {} - Program *program; + Program* program; int32_t cur_cycle = 0; int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0}; @@ -72,6 +74,7 @@ public: unsigned predict_cost(aco_ptr& instr); void add(aco_ptr& instr); void join(const BlockCycleEstimator& other); + private: unsigned get_waitcnt_cost(wait_imm imm); unsigned get_dependency_cost(aco_ptr& instr); @@ -81,8 +84,9 @@ private: }; struct wait_counter_info { - wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) : - vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) + : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) + {} unsigned vm; unsigned exp; @@ -100,107 +104,83 @@ struct perf_info { unsigned cost1; }; -static perf_info get_perf_info(Program *program, aco_ptr& instr) +static perf_info +get_perf_info(Program* program, aco_ptr& instr) { instr_class cls = instr_info.classes[(int)instr->opcode]; - #define WAIT(res) BlockCycleEstimator::res, 0 - #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt +#define WAIT(res) BlockCycleEstimator::res, 0 +#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt if (program->chip_class >= GFX10) { /* fp64 might be incorrect */ switch (cls) { case instr_class::valu32: case instr_class::valu_convert32: - case instr_class::valu_fma: - return {5, WAIT_USE(valu, 1)}; - case 
instr_class::valu64: - return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; + case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)}; + case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; case instr_class::valu_quarter_rate32: return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)}; case instr_class::valu_transcendental32: return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}; - case instr_class::valu_double: - return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_add: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_convert: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; case instr_class::valu_double_transcendental: return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; - case instr_class::salu: - return {2, WAIT_USE(scalar, 1)}; - case instr_class::smem: - return {0, WAIT_USE(scalar, 1)}; + case instr_class::salu: return {2, WAIT_USE(scalar, 1)}; + case instr_class::smem: return {0, WAIT_USE(scalar, 1)}; case instr_class::branch: - case instr_class::sendmsg: - return {0, WAIT_USE(branch_sendmsg, 1)}; + case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)}; case instr_class::ds: - return instr->ds().gds ? - perf_info{0, WAIT_USE(export_gds, 1)} : - perf_info{0, WAIT_USE(lds, 1)}; - case instr_class::exp: - return {0, WAIT_USE(export_gds, 1)}; - case instr_class::vmem: - return {0, WAIT_USE(vmem, 1)}; + return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)} + : perf_info{0, WAIT_USE(lds, 1)}; + case instr_class::exp: return {0, WAIT_USE(export_gds, 1)}; + case instr_class::vmem: return {0, WAIT_USE(vmem, 1)}; case instr_class::barrier: case instr_class::waitcnt: case instr_class::other: - default: - return {0}; + default: return {0}; } } else { switch (cls) { - case instr_class::valu32: - return {4, WAIT_USE(valu, 4)}; - case instr_class::valu_convert32: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu64: - return {8, WAIT_USE(valu, 8)}; - case instr_class::valu_quarter_rate32: - return {16, WAIT_USE(valu, 16)}; + case instr_class::valu32: return {4, WAIT_USE(valu, 4)}; + case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu64: return {8, WAIT_USE(valu, 8)}; + case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)}; case instr_class::valu_fma: - return program->dev.has_fast_fma32 ? - perf_info{4, WAIT_USE(valu, 4)} : - perf_info{16, WAIT_USE(valu, 16)}; - case instr_class::valu_transcendental32: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu_double: - return {64, WAIT_USE(valu, 64)}; - case instr_class::valu_double_add: - return {32, WAIT_USE(valu, 32)}; - case instr_class::valu_double_convert: - return {16, WAIT_USE(valu, 16)}; - case instr_class::valu_double_transcendental: - return {64, WAIT_USE(valu, 64)}; - case instr_class::salu: - return {4, WAIT_USE(scalar, 4)}; - case instr_class::smem: - return {4, WAIT_USE(scalar, 4)}; + return program->dev.has_fast_fma32 ? 
perf_info{4, WAIT_USE(valu, 4)} + : perf_info{16, WAIT_USE(valu, 16)}; + case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double: return {64, WAIT_USE(valu, 64)}; + case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)}; + case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)}; + case instr_class::salu: return {4, WAIT_USE(scalar, 4)}; + case instr_class::smem: return {4, WAIT_USE(scalar, 4)}; case instr_class::branch: return {8, WAIT_USE(branch_sendmsg, 8)}; return {4, WAIT_USE(branch_sendmsg, 4)}; case instr_class::ds: - return instr->ds().gds ? - perf_info{4, WAIT_USE(export_gds, 4)} : - perf_info{4, WAIT_USE(lds, 4)}; - case instr_class::exp: - return {16, WAIT_USE(export_gds, 16)}; - case instr_class::vmem: - return {4, WAIT_USE(vmem, 4)}; + return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)} + : perf_info{4, WAIT_USE(lds, 4)}; + case instr_class::exp: return {16, WAIT_USE(export_gds, 16)}; + case instr_class::vmem: return {4, WAIT_USE(vmem, 4)}; case instr_class::barrier: case instr_class::waitcnt: case instr_class::other: - default: - return {4}; + default: return {4}; } } - #undef WAIT_USE - #undef WAIT +#undef WAIT_USE +#undef WAIT } -void BlockCycleEstimator::use_resources(aco_ptr& instr) +void +BlockCycleEstimator::use_resources(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -215,7 +195,8 @@ void BlockCycleEstimator::use_resources(aco_ptr& instr) } } -int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr& instr) +int32_t +BlockCycleEstimator::cycles_until_res_available(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -228,7 +209,8 @@ int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr& in return cost; } -static wait_counter_info get_wait_counter_info(aco_ptr& instr) +static wait_counter_info +get_wait_counter_info(aco_ptr& instr) { /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance * depends a lot on the situation. */ @@ -252,8 +234,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr& instr) bool likely_desc_load = instr->operands[0].size() == 2; bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 
3 : 4); - bool const_offset = instr->operands[1].isConstant() && - (!soe || instr->operands.back().isConstant()); + bool const_offset = + instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); if (likely_desc_load || const_offset) return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */ @@ -273,7 +255,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr& instr) return wait_counter_info(0, 0, 0, 0); } -static wait_imm get_wait_imm(Program *program, aco_ptr& instr) +static wait_imm +get_wait_imm(Program* program, aco_ptr& instr) { if (instr->opcode == aco_opcode::s_endpgm) { return wait_imm(0, 0, 0, 0); @@ -297,7 +280,8 @@ static wait_imm get_wait_imm(Program *program, aco_ptr& instr) } } -unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) +unsigned +BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) { int deps_available = cur_cycle; @@ -337,13 +321,15 @@ unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) return deps_available - cur_cycle; } -unsigned BlockCycleEstimator::predict_cost(aco_ptr& instr) +unsigned +BlockCycleEstimator::predict_cost(aco_ptr& instr) { int32_t dep = get_dependency_cost(instr); return dep + std::max(cycles_until_res_available(instr) - dep, 0); } -static bool is_vector(aco_opcode op) +static bool +is_vector(aco_opcode op) { switch (instr_info.classes[(int)op]) { case instr_class::valu32: @@ -358,14 +344,13 @@ static bool is_vector(aco_opcode op) case instr_class::exp: case instr_class::valu64: case instr_class::valu_quarter_rate32: - case instr_class::valu_transcendental32: - return true; - default: - return false; + case instr_class::valu_transcendental32: return true; + default: return false; } } -void BlockCycleEstimator::add(aco_ptr& instr) +void +BlockCycleEstimator::add(aco_ptr& instr) { perf_info perf = get_perf_info(program, instr); @@ -411,13 +396,14 @@ void BlockCycleEstimator::add(aco_ptr& instr) int32_t result_available = start + MAX2(perf.latency, latency); for (Definition& def : instr->definitions) { - int32_t *available = ®_available[def.physReg().reg()]; + int32_t* available = ®_available[def.physReg().reg()]; for (unsigned i = 0; i < def.size(); i++) available[i] = MAX2(available[i], result_available); } } -static void join_queue(std::deque& queue, const std::deque& pred, int cycle_diff) +static void +join_queue(std::deque& queue, const std::deque& pred, int cycle_diff) { for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++) queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff); @@ -425,7 +411,8 @@ static void join_queue(std::deque& queue, const std::deque& pr queue.push_front(pred[i] + cycle_diff); } -void BlockCycleEstimator::join(const BlockCycleEstimator& pred) +void +BlockCycleEstimator::join(const BlockCycleEstimator& pred) { assert(cur_cycle == 0); @@ -435,8 +422,7 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred) } for (unsigned i = 0; i < 512; i++) - reg_available[i] = MAX2(reg_available[i], - pred.reg_available[i] - pred.cur_cycle + cur_cycle); + reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle); join_queue(lgkm, pred.lgkm, -pred.cur_cycle); join_queue(exp, pred.exp, -pred.cur_cycle); @@ -445,11 +431,12 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred) } /* instructions/branches/vmem_clauses/smem_clauses/cycles */ -void collect_preasm_stats(Program *program) +void +collect_preasm_stats(Program* program) { for (Block& block : program->blocks) { - std::set 
vmem_clause; - std::set smem_clause; + std::set vmem_clause; + std::set smem_clause; program->statistics[statistic_instructions] += block.instructions.size(); @@ -462,7 +449,8 @@ void collect_preasm_stats(Program *program) if (instr->isVMEM() && !instr->operands.empty()) { if (std::none_of(vmem_clause.begin(), vmem_clause.end(), - [&](Instruction *other) {return should_form_clause(instr.get(), other);})) + [&](Instruction* other) + { return should_form_clause(instr.get(), other); })) program->statistics[statistic_vmem_clauses]++; vmem_clause.insert(instr.get()); } else { @@ -471,12 +459,13 @@ void collect_preasm_stats(Program *program) if (instr->isSMEM() && !instr->operands.empty()) { if (std::none_of(smem_clause.begin(), smem_clause.end(), - [&](Instruction *other) {return should_form_clause(instr.get(), other);})) + [&](Instruction* other) + { return should_form_clause(instr.get(), other); })) program->statistics[statistic_smem_clauses]++; smem_clause.insert(instr.get()); } else { smem_clause.clear(); - } + } } } @@ -514,8 +503,10 @@ void collect_preasm_stats(Program *program) iter *= pow(0.5, block.uniform_if_depth); iter *= pow(0.75, block.divergent_if_logical_depth); - bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 && - program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); + bool divergent_if_linear_else = + block.logical_preds.empty() && block.linear_preds.size() == 1 && + block.linear_succs.size() == 1 && + program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); if (divergent_if_linear_else) iter *= 0.25; @@ -540,7 +531,8 @@ void collect_preasm_stats(Program *program) double max_utilization = 1.0; if (program->workgroup_size != UINT_MAX) - max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); + max_utilization = + program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); wave64_per_cycle *= max_utilization; program->statistics[statistic_latency] = round(latency); @@ -551,7 +543,8 @@ void collect_preasm_stats(Program *program) fprintf(stderr, "num_waves: %u\n", program->num_waves); fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]); - fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]); + fprintf(stderr, "branch_sendmsg_usage: %f\n", + usage[(int)BlockCycleEstimator::branch_sendmsg]); fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]); fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]); fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]); @@ -565,9 +558,10 @@ void collect_preasm_stats(Program *program) } } -void collect_postasm_stats(Program *program, const std::vector& code) +void +collect_postasm_stats(Program* program, const std::vector& code) { program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); } -} +} // namespace aco diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h index 9d24fb936db..88f52182a5f 100644 --- a/src/amd/compiler/aco_util.h +++ b/src/amd/compiler/aco_util.h @@ -35,207 +35,198 @@ namespace aco { /*! \brief Definition of a span object -* -* \details A "span" is an "array view" type for holding a view of contiguous -* data. The "span" object does not own the data itself. 
-*/ -template -class span { + * + * \details A "span" is an "array view" type for holding a view of contiguous + * data. The "span" object does not own the data itself. + */ +template class span { public: - using value_type = T; - using pointer = value_type*; - using const_pointer = const value_type*; - using reference = value_type&; - using const_reference = const value_type&; - using iterator = pointer; - using const_iterator = const_pointer; - using reverse_iterator = std::reverse_iterator; + using value_type = T; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; - using size_type = uint16_t; - using difference_type = ptrdiff_t; + using size_type = uint16_t; + using difference_type = ptrdiff_t; /*! \brief Compiler generated default constructor - */ + */ constexpr span() = default; /*! \brief Constructor taking a pointer and the length of the span - * \param[in] data Pointer to the underlying data array - * \param[in] length The size of the span - */ - constexpr span(uint16_t offset_, const size_type length_) - : offset{ offset_ } , length{ length_ } {} + * \param[in] data Pointer to the underlying data array + * \param[in] length The size of the span + */ + constexpr span(uint16_t offset_, const size_type length_) : offset{offset_}, length{length_} {} /*! \brief Returns an iterator to the begin of the span - * \return data - */ - constexpr iterator begin() noexcept { - return (pointer)((uintptr_t)this + offset); - } + * \return data + */ + constexpr iterator begin() noexcept { return (pointer)((uintptr_t)this + offset); } /*! \brief Returns a const_iterator to the begin of the span - * \return data - */ - constexpr const_iterator begin() const noexcept { + * \return data + */ + constexpr const_iterator begin() const noexcept + { return (const_pointer)((uintptr_t)this + offset); } /*! \brief Returns an iterator to the end of the span - * \return data + length - */ - constexpr iterator end() noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr iterator end() noexcept { return std::next(begin(), length); } /*! \brief Returns a const_iterator to the end of the span - * \return data + length - */ - constexpr const_iterator end() const noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr const_iterator end() const noexcept { return std::next(begin(), length); } /*! \brief Returns a const_iterator to the begin of the span - * \return data - */ - constexpr const_iterator cbegin() const noexcept { - return begin(); - } + * \return data + */ + constexpr const_iterator cbegin() const noexcept { return begin(); } /*! \brief Returns a const_iterator to the end of the span - * \return data + length - */ - constexpr const_iterator cend() const noexcept { - return std::next(begin(), length); - } + * \return data + length + */ + constexpr const_iterator cend() const noexcept { return std::next(begin(), length); } /*! \brief Returns a reverse_iterator to the end of the span - * \return reverse_iterator(end()) - */ - constexpr reverse_iterator rbegin() noexcept { - return reverse_iterator(end()); - } + * \return reverse_iterator(end()) + */ + constexpr reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } /*! 
\brief Returns a const_reverse_iterator to the end of the span - * \return reverse_iterator(end()) - */ - constexpr const_reverse_iterator rbegin() const noexcept { + * \return reverse_iterator(end()) + */ + constexpr const_reverse_iterator rbegin() const noexcept + { return const_reverse_iterator(end()); } /*! \brief Returns a reverse_iterator to the begin of the span - * \return reverse_iterator(begin()) - */ - constexpr reverse_iterator rend() noexcept { - return reverse_iterator(begin()); - } + * \return reverse_iterator(begin()) + */ + constexpr reverse_iterator rend() noexcept { return reverse_iterator(begin()); } /*! \brief Returns a const_reverse_iterator to the begin of the span - * \return reverse_iterator(begin()) - */ - constexpr const_reverse_iterator rend() const noexcept { + * \return reverse_iterator(begin()) + */ + constexpr const_reverse_iterator rend() const noexcept + { return const_reverse_iterator(begin()); } /*! \brief Returns a const_reverse_iterator to the end of the span - * \return rbegin() - */ - constexpr const_reverse_iterator crbegin() const noexcept { + * \return rbegin() + */ + constexpr const_reverse_iterator crbegin() const noexcept + { return const_reverse_iterator(cend()); } /*! \brief Returns a const_reverse_iterator to the begin of the span - * \return rend() - */ - constexpr const_reverse_iterator crend() const noexcept { + * \return rend() + */ + constexpr const_reverse_iterator crend() const noexcept + { return const_reverse_iterator(cbegin()); } /*! \brief Unchecked access operator - * \param[in] index Index of the element we want to access - * \return *(std::next(data, index)) - */ - constexpr reference operator[](const size_type index) noexcept { + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr reference operator[](const size_type index) noexcept + { assert(length > index); return *(std::next(begin(), index)); } /*! \brief Unchecked const access operator - * \param[in] index Index of the element we want to access - * \return *(std::next(data, index)) - */ - constexpr const_reference operator[](const size_type index) const noexcept { + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr const_reference operator[](const size_type index) const noexcept + { assert(length > index); return *(std::next(begin(), index)); } /*! \brief Returns a reference to the last element of the span - * \return *(std::next(data, length - 1)) - */ - constexpr reference back() noexcept { + * \return *(std::next(data, length - 1)) + */ + constexpr reference back() noexcept + { assert(length > 0); return *(std::next(begin(), length - 1)); } /*! \brief Returns a const_reference to the last element of the span - * \return *(std::next(data, length - 1)) - */ - constexpr const_reference back() const noexcept { + * \return *(std::next(data, length - 1)) + */ + constexpr const_reference back() const noexcept + { assert(length > 0); return *(std::next(begin(), length - 1)); } /*! \brief Returns a reference to the first element of the span - * \return *begin() - */ - constexpr reference front() noexcept { + * \return *begin() + */ + constexpr reference front() noexcept + { assert(length > 0); return *begin(); } /*! 
\brief Returns a const_reference to the first element of the span - * \return *cbegin() - */ - constexpr const_reference front() const noexcept { + * \return *cbegin() + */ + constexpr const_reference front() const noexcept + { assert(length > 0); return *cbegin(); } /*! \brief Returns true if the span is empty - * \return length == 0 - */ - constexpr bool empty() const noexcept { - return length == 0; - } + * \return length == 0 + */ + constexpr bool empty() const noexcept { return length == 0; } /*! \brief Returns the size of the span - * \return length == 0 - */ - constexpr size_type size() const noexcept { - return length; - } + * \return length == 0 + */ + constexpr size_type size() const noexcept { return length; } /*! \brief Decreases the size of the span by 1 - */ - constexpr void pop_back() noexcept { + */ + constexpr void pop_back() noexcept + { assert(length > 0); --length; } /*! \brief Adds an element to the end of the span - */ - constexpr void push_back(const_reference val) noexcept { - *std::next(begin(), length++) = val; - } + */ + constexpr void push_back(const_reference val) noexcept { *std::next(begin(), length++) = val; } /*! \brief Clears the span - */ - constexpr void clear() noexcept { + */ + constexpr void clear() noexcept + { offset = 0; length = 0; } private: - uint16_t offset{ 0 }; //!> Byte offset from span to data - size_type length{ 0 }; //!> Size of the span + uint16_t offset{0}; //!> Byte offset from span to data + size_type length{0}; //!> Size of the span }; /* @@ -250,30 +241,32 @@ private: */ struct IDSet { struct Iterator { - const IDSet *set; + const IDSet* set; union { struct { - uint32_t bit:6; - uint32_t word:26; + uint32_t bit : 6; + uint32_t word : 26; }; uint32_t id; }; - Iterator& operator ++(); + Iterator& operator++(); - bool operator != (const Iterator& other) const; + bool operator!=(const Iterator& other) const; - uint32_t operator * () const; + uint32_t operator*() const; }; - size_t count(uint32_t id) const { + size_t count(uint32_t id) const + { if (id >= words.size() * 64) return 0; return words[id / 64u] & (1ull << (id % 64u)) ? 
1 : 0; } - Iterator find(uint32_t id) const { + Iterator find(uint32_t id) const + { if (!count(id)) return end(); @@ -284,7 +277,8 @@ struct IDSet { return it; } - std::pair insert(uint32_t id) { + std::pair insert(uint32_t id) + { if (words.size() * 64u <= id) words.resize(id / 64u + 1); @@ -302,7 +296,8 @@ struct IDSet { return std::make_pair(it, true); } - size_t erase(uint32_t id) { + size_t erase(uint32_t id) + { if (!count(id)) return 0; @@ -311,7 +306,8 @@ struct IDSet { return 1; } - Iterator cbegin() const { + Iterator cbegin() const + { Iterator it; it.set = this; for (size_t i = 0; i < words.size(); i++) { @@ -324,7 +320,8 @@ struct IDSet { return end(); } - Iterator cend() const { + Iterator cend() const + { Iterator it; it.set = this; it.word = words.size(); @@ -332,27 +329,21 @@ struct IDSet { return it; } - Iterator begin() const { - return cbegin(); - } + Iterator begin() const { return cbegin(); } - Iterator end() const { - return cend(); - } + Iterator end() const { return cend(); } - bool empty() const { - return bits_set == 0; - } + bool empty() const { return bits_set == 0; } - size_t size() const { - return bits_set; - } + size_t size() const { return bits_set; } std::vector words; uint32_t bits_set = 0; }; -inline IDSet::Iterator& IDSet::Iterator::operator ++() { +inline IDSet::Iterator& +IDSet::Iterator::operator++() +{ uint64_t m = set->words[word]; m &= ~((2ull << bit) - 1ull); if (!m) { @@ -374,12 +365,16 @@ inline IDSet::Iterator& IDSet::Iterator::operator ++() { return *this; } -inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const { +inline bool +IDSet::Iterator::operator!=(const IDSet::Iterator& other) const +{ assert(set == other.set); return id != other.id; } -inline uint32_t IDSet::Iterator::operator * () const { +inline uint32_t +IDSet::Iterator::operator*() const +{ return (word << 6) | bit; } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 400d58e5765..af1393ba418 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" + #include "util/memstream.h" #include @@ -32,11 +33,11 @@ namespace aco { -static void aco_log(Program *program, enum radv_compiler_debug_level level, - const char *prefix, const char *file, unsigned line, - const char *fmt, va_list args) +static void +aco_log(Program* program, enum radv_compiler_debug_level level, const char* prefix, + const char* file, unsigned line, const char* fmt, va_list args) { - char *msg; + char* msg; if (program->debug.shorten_messages) { msg = ralloc_vasprintf(NULL, fmt, args); @@ -55,38 +56,39 @@ static void aco_log(Program *program, enum radv_compiler_debug_level level, ralloc_free(msg); } -void _aco_perfwarn(Program *program, const char *file, unsigned line, - const char *fmt, ...) +void +_aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...) { va_list args; va_start(args, fmt); - aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, - "ACO PERFWARN:\n", file, line, fmt, args); + aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args); va_end(args); } -void _aco_err(Program *program, const char *file, unsigned line, - const char *fmt, ...) +void +_aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...) 
{ va_list args; va_start(args, fmt); - aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, - "ACO ERROR:\n", file, line, fmt, args); + aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args); va_end(args); } -bool validate_ir(Program* program) +bool +validate_ir(Program* program) { bool is_valid = true; - auto check = [&program, &is_valid](bool success, const char * msg, aco::Instruction * instr) -> void { + auto check = [&program, &is_valid](bool success, const char* msg, + aco::Instruction* instr) -> void + { if (!success) { - char *out; + char* out; size_t outsize; struct u_memstream mem; u_memstream_open(&mem, &out, &outsize); - FILE *const memf = u_memstream_get(&mem); + FILE* const memf = u_memstream_get(&mem); fprintf(memf, "%s: ", msg); aco_print_instr(instr, memf); @@ -99,7 +101,9 @@ bool validate_ir(Program* program) } }; - auto check_block = [&program, &is_valid](bool success, const char * msg, aco::Block * block) -> void { + auto check_block = [&program, &is_valid](bool success, const char* msg, + aco::Block* block) -> void + { if (!success) { aco_err(program, "%s: BB%u", msg, block->index); is_valid = false; @@ -132,32 +136,32 @@ bool validate_ir(Program* program) base_format = Format::VINTRP; } } - check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get()); + check(base_format == instr_info.format[(int)instr->opcode], + "Wrong base format for instruction", instr.get()); /* check VOP3 modifiers */ if (instr->isVOP3() && instr->format != Format::VOP3) { - check(base_format == Format::VOP2 || - base_format == Format::VOP1 || - base_format == Format::VOPC || - base_format == Format::VINTRP, + check(base_format == Format::VOP2 || base_format == Format::VOP1 || + base_format == Format::VOPC || base_format == Format::VINTRP, "Format cannot have VOP3/VOP3B applied", instr.get()); } /* check SDWA */ if (instr->isSDWA()) { - check(base_format == Format::VOP2 || - base_format == Format::VOP1 || - base_format == Format::VOPC, + check(base_format == Format::VOP2 || base_format == Format::VOP1 || + base_format == Format::VOPC, "Format cannot have SDWA applied", instr.get()); check(program->chip_class >= GFX8, "SDWA is GFX8+ only", instr.get()); SDWA_instruction& sdwa = instr->sdwa(); - check(sdwa.omod == 0 || program->chip_class >= GFX9, "SDWA omod only supported on GFX9+", instr.get()); + check(sdwa.omod == 0 || program->chip_class >= GFX9, + "SDWA omod only supported on GFX9+", instr.get()); if (base_format == Format::VOPC) { - check(sdwa.clamp == false || program->chip_class == GFX8, "SDWA VOPC clamp only supported on GFX8", instr.get()); + check(sdwa.clamp == false || program->chip_class == GFX8, + "SDWA VOPC clamp only supported on GFX8", instr.get()); check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) || - program->chip_class >= GFX9, + program->chip_class >= GFX9, "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get()); } @@ -171,8 +175,7 @@ bool validate_ir(Program* program) } const bool sdwa_opcodes = - instr->opcode != aco_opcode::v_fmac_f32 && - instr->opcode != aco_opcode::v_fmac_f16 && + instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 && instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 && instr->opcode != aco_opcode::v_fmamk_f16 && @@ -186,67 +189,75 @@ bool validate_ir(Program* program) const bool feature_mac = program->chip_class == GFX8 && - (instr->opcode == aco_opcode::v_mac_f32 && 
- instr->opcode == aco_opcode::v_mac_f16); + (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16); check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get()); if (instr->definitions[0].regClass().is_subdword()) - check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), "Unexpected SDWA sel for sub-dword definition", instr.get()); + check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), + "Unexpected SDWA sel for sub-dword definition", instr.get()); } /* check opsel */ if (instr->isVOP3()) { VOP3_instruction& vop3 = instr->vop3(); - check(vop3.opsel == 0 || program->chip_class >= GFX9, "Opsel is only supported on GFX9+", instr.get()); + check(vop3.opsel == 0 || program->chip_class >= GFX9, + "Opsel is only supported on GFX9+", instr.get()); for (unsigned i = 0; i < 3; i++) { if (i >= instr->operands.size() || - (instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())) + (instr->operands[i].hasRegClass() && + instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())) check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get()); } if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed()) - check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", instr.get()); + check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", + instr.get()); } /* check for undefs */ for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isUndefined()) { bool flat = instr->isFlatLike(); - bool can_be_undef = is_phi(instr) || instr->isEXP() || - instr->isReduction() || + bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() || instr->opcode == aco_opcode::p_create_vector || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1); check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); } else { - check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || instr->operands[i].isConstant(), "Uninitialized Operand", instr.get()); + check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || + instr->operands[i].isConstant(), + "Uninitialized Operand", instr.get()); } } /* check subdword definitions */ for (unsigned i = 0; i < instr->definitions.size(); i++) { if (instr->definitions[i].regClass().is_subdword()) - check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, "Only Pseudo instructions can write subdword registers larger than 4 bytes", instr.get()); + check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, + "Only Pseudo instructions can write subdword registers larger than 4 bytes", + instr.get()); } if (instr->isSALU() || instr->isVALU()) { /* check literals */ Operand literal(s1); - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; if (!op.isLiteral()) continue; check(!instr->isDPP() && !instr->isSDWA() && - (!instr->isVOP3() || program->chip_class >= GFX10) && - (!instr->isVOP3P() || program->chip_class >= GFX10), + (!instr->isVOP3() || program->chip_class >= GFX10) && + (!instr->isVOP3P() || program->chip_class >= GFX10), "Literal applied on wrong instruction format", instr.get()); - check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == 
op.constantValue()), "Only 1 Literal allowed", instr.get()); + check(literal.isUndefined() || (literal.size() == op.size() && + literal.constantValue() == op.constantValue()), + "Only 1 Literal allowed", instr.get()); literal = op; - check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get()); + check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, + "Wrong source position for Literal argument", instr.get()); } /* check num sgprs for VALU */ @@ -264,8 +275,7 @@ bool validate_ir(Program* program) else if (instr->isDPP()) scalar_mask = 0x0; - if (instr->isVOPC() || - instr->opcode == aco_opcode::v_readfirstlane_b32 || + if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64) { check(instr->definitions[0].getTemp().type() == RegType::sgpr, @@ -277,45 +287,42 @@ bool validate_ir(Program* program) unsigned num_sgprs = 0; unsigned sgpr[] = {0, 0}; - for (unsigned i = 0; i < instr->operands.size(); i++) - { + for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64) { - check(i != 1 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), + check(i != 1 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), "Must be a SGPR or a constant", instr.get()); - check(i == 1 || - (op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4), + check(i == 1 || (op.isTemp() && op.regClass().type() == RegType::vgpr && + op.bytes() <= 4), "Wrong Operand type for VALU instruction", instr.get()); continue; } if (instr->opcode == aco_opcode::v_permlane16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32) { - check(i != 0 || - (op.isTemp() && op.regClass().type() == RegType::vgpr), + check(i != 0 || (op.isTemp() && op.regClass().type() == RegType::vgpr), "Operand 0 of v_permlane must be VGPR", instr.get()); - check(i == 0 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), - "Lane select operands of v_permlane must be SGPR or constant", instr.get()); + check(i == 0 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), + "Lane select operands of v_permlane must be SGPR or constant", + instr.get()); } if (instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { - check(i != 2 || - (op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4), + check(i != 2 || (op.isTemp() && op.regClass().type() == RegType::vgpr && + op.bytes() <= 4), "Wrong Operand type for VALU instruction", instr.get()); - check(i == 2 || - (op.isTemp() && op.regClass().type() == RegType::sgpr) || - op.isConstant(), + check(i == 2 || (op.isTemp() && op.regClass().type() == RegType::sgpr) || + op.isConstant(), "Must be a SGPR or a constant", instr.get()); continue; } if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) { - check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", instr.get()); + check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", + instr.get()); if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) { if (num_sgprs < 2) @@ -324,19 +331,22 @@ bool validate_ir(Program* program) } if (op.isConstant() && 
!op.isLiteral()) - check(scalar_mask & (1 << i), "Wrong source position for constant argument", instr.get()); + check(scalar_mask & (1 << i), "Wrong source position for constant argument", + instr.get()); } - check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get()); + check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, + "Too many SGPRs/literals", instr.get()); } if (instr->isSOP1() || instr->isSOP2()) { - check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::sgpr, + "Wrong Definition type for SALU instruction", instr.get()); for (const Operand& op : instr->operands) { - check(op.isConstant() || op.regClass().type() <= RegType::sgpr, - "Wrong Operand type for SALU instruction", instr.get()); + check(op.isConstant() || op.regClass().type() <= RegType::sgpr, + "Wrong Operand type for SALU instruction", instr.get()); + } } } - } switch (instr->format) { case Format::PSEUDO: { @@ -346,7 +356,8 @@ bool validate_ir(Program* program) check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get()); size += op.bytes(); } - check(size == instr->definitions[0].bytes(), "Definition size does not match operand sizes", instr.get()); + check(size == instr->definitions[0].bytes(), + "Definition size does not match operand sizes", instr.get()); if (instr->definitions[0].getTemp().type() == RegType::sgpr) { for (const Operand& op : instr->operands) { check(op.isConstant() || op.regClass().type() == RegType::sgpr, @@ -354,55 +365,75 @@ bool validate_ir(Program* program) } } } else if (instr->opcode == aco_opcode::p_extract_vector) { - check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get()); - check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= instr->operands[0].bytes(), "Index out of range", instr.get()); - check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr, + check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), + "Wrong Operand types", instr.get()); + check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= + instr->operands[0].bytes(), + "Index out of range", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || + instr->operands[0].regClass().type() == RegType::sgpr, "Cannot extract SGPR value from VGPR vector", instr.get()); - check(program->chip_class >= GFX9 || !instr->definitions[0].regClass().is_subdword() || - instr->operands[0].regClass().type() == RegType::vgpr, "Cannot extract subdword from SGPR before GFX9+", instr.get()); + check(program->chip_class >= GFX9 || + !instr->definitions[0].regClass().is_subdword() || + instr->operands[0].regClass().type() == RegType::vgpr, + "Cannot extract subdword from SGPR before GFX9+", instr.get()); } else if (instr->opcode == aco_opcode::p_split_vector) { check(instr->operands[0].isTemp(), "Operand must be a temporary", instr.get()); unsigned size = 0; for (const Definition& def : instr->definitions) { size += def.bytes(); } - check(size == instr->operands[0].bytes(), "Operand size does not match definition sizes", instr.get()); + check(size == instr->operands[0].bytes(), + "Operand size does not match definition sizes", instr.get()); if (instr->operands[0].getTemp().type() == RegType::vgpr) { for (const Definition& def : 
instr->definitions) - check(def.regClass().type() == RegType::vgpr, "Wrong Definition type for VGPR split_vector", instr.get()); + check(def.regClass().type() == RegType::vgpr, + "Wrong Definition type for VGPR split_vector", instr.get()); } else { for (const Definition& def : instr->definitions) - check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get()); + check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), + "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get()); } } else if (instr->opcode == aco_opcode::p_parallelcopy) { - check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get()); + check(instr->definitions.size() == instr->operands.size(), + "Number of Operands does not match number of Definitions", instr.get()); for (unsigned i = 0; i < instr->operands.size(); i++) { - check(instr->definitions[i].bytes() == instr->operands[i].bytes(), "Operand and Definition size must match", instr.get()); + check(instr->definitions[i].bytes() == instr->operands[i].bytes(), + "Operand and Definition size must match", instr.get()); if (instr->operands[i].isTemp()) - check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) || - (instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr), + check((instr->definitions[i].getTemp().type() == + instr->operands[i].regClass().type()) || + (instr->definitions[i].getTemp().type() == RegType::vgpr && + instr->operands[i].regClass().type() == RegType::sgpr), "Operand and Definition types do not match", instr.get()); } } else if (instr->opcode == aco_opcode::p_phi) { - check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); - check(instr->definitions[0].getTemp().type() == RegType::vgpr, "Logical Phi Definition must be vgpr", instr.get()); + check(instr->operands.size() == block.logical_preds.size(), + "Number of Operands does not match number of predecessors", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr, + "Logical Phi Definition must be vgpr", instr.get()); for (const Operand& op : instr->operands) - check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get()); + check(instr->definitions[0].size() == op.size(), + "Operand sizes must match Definition size", instr.get()); } else if (instr->opcode == aco_opcode::p_linear_phi) { for (const Operand& op : instr->operands) { - check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get()); - check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get()); + check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", + instr.get()); + check(instr->definitions[0].size() == op.size(), + "Operand sizes must match Definition size", instr.get()); } - check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); - } else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) { - check(instr->operands[0].isTemp(), - "Data operand must be temporary", instr.get()); + check(instr->operands.size() == block.linear_preds.size(), + "Number of Operands does not match number of predecessors", instr.get()); + } else if (instr->opcode == 
aco_opcode::p_extract ||
+                    instr->opcode == aco_opcode::p_insert) {
+            check(instr->operands[0].isTemp(), "Data operand must be temporary", instr.get());
             check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
             if (instr->opcode == aco_opcode::p_extract)
-               check(instr->operands[3].isConstant(), "Sign-extend flag must be constant", instr.get());
+               check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
+                     instr.get());
 
             check(instr->definitions[0].getTemp().type() != RegType::sgpr ||
-                  instr->operands[0].getTemp().type() == RegType::sgpr,
+                     instr->operands[0].getTemp().type() == RegType::sgpr,
                   "Can't extract/insert VGPR to SGPR", instr.get());
 
             if (instr->operands[0].getTemp().type() == RegType::vgpr)
@@ -410,69 +441,106 @@ bool validate_ir(Program* program)
                      "Sizes of operand and definition must match", instr.get());
 
             if (instr->definitions[0].getTemp().type() == RegType::sgpr)
-               check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() && instr->definitions[1].physReg() == scc, "SGPR extract/insert needs a SCC definition", instr.get());
+               check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
+                        instr->definitions[1].physReg() == scc,
+                     "SGPR extract/insert needs a SCC definition", instr.get());
 
-            check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16), "Size must be 8 or 16", instr.get());
-            check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u, "Size must be smaller than source", instr.get());
+            check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
+                  "Size must be 8 or 16", instr.get());
+            check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
+                  "Size must be smaller than source", instr.get());
 
-            unsigned comp = instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
-            check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get());
+            unsigned comp =
+               instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
+            check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
+                  instr.get());
          }
          break;
       }
       case Format::PSEUDO_REDUCTION: {
-         for (const Operand &op : instr->operands)
-            check(op.regClass().type() == RegType::vgpr, "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.", instr.get());
+         for (const Operand& op : instr->operands)
+            check(op.regClass().type() == RegType::vgpr,
+                  "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
+                  instr.get());
 
-         if (instr->opcode == aco_opcode::p_reduce && instr->reduction().cluster_size == program->wave_size)
-            check(instr->definitions[0].regClass().type() == RegType::sgpr || program->wave_size == 32, "The result of unclustered reductions must go into an SGPR.", instr.get());
+         if (instr->opcode == aco_opcode::p_reduce &&
+             instr->reduction().cluster_size == program->wave_size)
+            check(instr->definitions[0].regClass().type() == RegType::sgpr ||
+                     program->wave_size == 32,
+                  "The result of unclustered reductions must go into an SGPR.", instr.get());
          else
-            check(instr->definitions[0].regClass().type() == RegType::vgpr, "The result of scans and clustered reductions must go into a VGPR.", instr.get());
+            check(instr->definitions[0].regClass().type() == RegType::vgpr,
+                  "The result of scans and clustered reductions must go into a VGPR.",
+                  instr.get());
 
          break;
       }
       case Format::SMEM: {
         if (instr->operands.size() >= 1)
            check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) ||
-                 (instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get());
+                    (instr->operands[0].isTemp() &&
+                     instr->operands[0].regClass().type() == RegType::sgpr),
+                 "SMEM operands must be sgpr", instr.get());
         if (instr->operands.size() >= 2)
-           check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr),
+           check(instr->operands[1].isConstant() ||
+                    (instr->operands[1].isTemp() &&
+                     instr->operands[1].regClass().type() == RegType::sgpr),
                  "SMEM offset must be constant or sgpr", instr.get());
         if (!instr->definitions.empty())
-           check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get());
+           check(instr->definitions[0].getTemp().type() == RegType::sgpr,
+                 "SMEM result must be sgpr", instr.get());
         break;
       }
       case Format::MTBUF:
       case Format::MUBUF: {
-         check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
-         check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
+         check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
+               instr.get());
+         check(instr->operands[1].hasRegClass() &&
+                  instr->operands[1].regClass().type() == RegType::vgpr,
                "VADDR must be in vgpr for VMEM instructions", instr.get());
-         check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get());
-         check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get());
+         check(
+            instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr,
+            "VMEM resource constant must be sgpr", instr.get());
+         check(instr->operands.size() < 4 ||
+                  (instr->operands[3].isTemp() &&
+                   instr->operands[3].regClass().type() == RegType::vgpr),
+               "VMEM write data must be vgpr", instr.get());
          break;
       }
       case Format::MIMG: {
-         check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
-         check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
+         check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
+               instr.get());
+         check(instr->operands[0].hasRegClass() &&
+                  (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
                "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
          if (instr->operands[1].hasRegClass())
-            check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
+            check(instr->operands[1].regClass() == s4,
+                  "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
          if (!instr->operands[2].isUndefined()) {
             bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
                               instr->opcode == aco_opcode::image_atomic_fcmpswap;
-            check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
-                  "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
+            check(instr->definitions.empty() ||
+                     (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
+                      is_cmpswap),
+                  "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
+                  "TFE/LWE loads",
+                  instr.get());
          }
-         check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
+         check(instr->operands.size() == 4 || program->chip_class >= GFX10,
+               "NSA is only supported on GFX10+", instr.get());
          for (unsigned i = 3; i < instr->operands.size(); i++) {
             if (instr->operands.size() == 4) {
-               check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+               check(instr->operands[i].hasRegClass() &&
+                        instr->operands[i].regClass().type() == RegType::vgpr,
                      "MIMG operands[3] (VADDR) must be VGPR", instr.get());
             } else {
-               check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
+               check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used",
+                     instr.get());
             }
          }
-         check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
+         check(instr->definitions.empty() ||
+                  (instr->definitions[0].isTemp() &&
+                   instr->definitions[0].regClass().type() == RegType::vgpr),
                "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
          break;
       }
@@ -482,31 +550,38 @@ bool validate_ir(Program* program)
                  "Only VGPRs are valid DS instruction operands", instr.get());
          }
          if (!instr->definitions.empty())
-            check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get());
+            check(instr->definitions[0].getTemp().type() == RegType::vgpr,
+                  "DS instruction must return VGPR", instr.get());
         break;
       }
       case Format::EXP: {
         for (unsigned i = 0; i < 4; i++)
-            check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+            check(instr->operands[i].hasRegClass() &&
+                     instr->operands[i].regClass().type() == RegType::vgpr,
                   "Only VGPRs are valid Export arguments", instr.get());
         break;
       }
       case Format::FLAT:
-         check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get());
+         check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
+               instr.get());
         FALLTHROUGH;
       case Format::GLOBAL:
       case Format::SCRATCH: {
-         check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
-         check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr,
+         check(
+            instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr,
+            "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
+         check(instr->operands[1].hasRegClass() &&
+                  instr->operands[1].regClass().type() == RegType::sgpr,
               "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
         if (!instr->definitions.empty())
-            check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
+            check(instr->definitions[0].getTemp().type() == RegType::vgpr,
+                  "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
         else
-            check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
        break;
+            check(instr->operands[2].regClass().type() == RegType::vgpr,
+                  "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
        break;
      }
-      default:
-         break;
+      default: break;
      }
   }
 }
@@ -518,20 +593,26 @@ bool validate_ir(Program* program)
 
      /* predecessors/successors should be sorted */
     for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
-        check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block);
+        check_block(block.linear_preds[j] < block.linear_preds[j + 1],
+                    "linear predecessors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
-        check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block);
+        check_block(block.logical_preds[j] < block.logical_preds[j + 1],
+                    "logical predecessors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
-        check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block);
+        check_block(block.linear_succs[j] < block.linear_succs[j + 1],
+                    "linear successors must be sorted", &block);
     for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
-        check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block);
+        check_block(block.logical_succs[j] < block.logical_succs[j + 1],
+                    "logical successors must be sorted", &block);
 
     /* critical edges are not allowed */
     if (block.linear_preds.size() > 1) {
        for (unsigned pred : block.linear_preds)
-          check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]);
+          check_block(program->blocks[pred].linear_succs.size() == 1,
+                      "linear critical edges are not allowed", &program->blocks[pred]);
        for (unsigned pred : block.logical_preds)
-          check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]);
+          check_block(program->blocks[pred].logical_succs.size() == 1,
+                      "logical critical edges are not allowed", &program->blocks[pred]);
     }
  }
 
@@ -544,8 +625,8 @@ namespace {
 struct Location {
    Location() : block(NULL), instr(NULL) {}
 
-   Block *block;
-   Instruction *instr; //NULL if it's the block's live-in
+   Block* block;
+   Instruction* instr; // NULL if it's the block's live-in
 };
 
 struct Assignment {
@@ -554,18 +635,20 @@ struct Assignment {
    PhysReg reg;
 };
 
-bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...) {
+bool
+ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
+{
    va_list args;
    va_start(args, fmt);
    char msg[1024];
    vsprintf(msg, fmt, args);
    va_end(args);
 
-   char *out;
+   char* out;
    size_t outsize;
    struct u_memstream mem;
    u_memstream_open(&mem, &out, &outsize);
-   FILE *const memf = u_memstream_get(&mem);
+   FILE* const memf = u_memstream_get(&mem);
 
    fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
    if (loc.instr) {
@@ -587,7 +670,8 @@ bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...
    return true;
 }
 
-bool validate_subdword_operand(chip_class chip, const aco_ptr& instr, unsigned index)
+bool
+validate_subdword_operand(chip_class chip, const aco_ptr& instr, unsigned index)
 {
    Operand op = instr->operands[index];
    unsigned byte = op.physReg().byte();
@@ -635,14 +719,14 @@ bool validate_subdword_operand(chip_class chip, const aco_ptr& inst
       if (byte == 2 && index == 2)
         return true;
      break;
-   default:
-      break;
+   default: break;
    }
 
    return byte == 0;
 }
 
-bool validate_subdword_definition(chip_class chip, const aco_ptr& instr)
+bool
+validate_subdword_definition(chip_class chip, const aco_ptr& instr)
 {
    Definition def = instr->definitions[0];
    unsigned byte = def.physReg().byte();
@@ -664,16 +748,15 @@ bool validate_subdword_definition(chip_class chip, const aco_ptr& i
    case aco_opcode::global_load_ubyte_d16_hi:
    case aco_opcode::global_load_short_d16_hi:
    case aco_opcode::ds_read_u8_d16_hi:
-   case aco_opcode::ds_read_u16_d16_hi:
-      return byte == 2;
-   default:
-      break;
+   case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
+   default: break;
    }
 
    return byte == 0;
 }
 
-unsigned get_subdword_bytes_written(Program *program, const aco_ptr& instr, unsigned index)
+unsigned
+get_subdword_bytes_written(Program* program, const aco_ptr& instr, unsigned index)
 {
    chip_class chip = program->chip_class;
    Definition def = instr->definitions[index];
@@ -703,8 +786,7 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr
    case aco_opcode::global_load_ubyte_d16_hi:
    case aco_opcode::global_load_short_d16_hi:
    case aco_opcode::ds_read_u8_d16_hi:
-   case aco_opcode::ds_read_u16_d16_hi:
-      return program->dev.sram_ecc_enabled ? 4 : 2;
+   case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
    case aco_opcode::v_mad_f16:
    case aco_opcode::v_mad_u16:
    case aco_opcode::v_mad_i16:
@@ -714,16 +796,18 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr
      if (chip >= GFX9)
         return 2;
      break;
-   default:
-      break;
+   default: break;
    }
 
-   return MAX2(chip >= GFX10 ? def.bytes() : 4, instr_info.definition_size[(int)instr->opcode] / 8u);
+   return MAX2(chip >= GFX10 ? def.bytes() : 4,
+               instr_info.definition_size[(int)instr->opcode] / 8u);
 }
 
 } /* end namespace */
 
-bool validate_ra(Program *program) {
+bool
+validate_ra(Program* program)
+{
    if (!(debug_flags & DEBUG_VALIDATE_RA))
       return false;
 
@@ -754,13 +838,21 @@ bool validate_ra(Program *program) {
             if (!op.isFixed())
               err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
            if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
-              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
-           if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
-               (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
-              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
+              err |=
+                 ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
+                         "Operand %d has an inconsistent register assignment with instruction", i);
+           if ((op.getTemp().type() == RegType::vgpr &&
+                op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
+               (op.getTemp().type() == RegType::sgpr &&
+                op.physReg() + op.size() > program->config->num_sgprs &&
+                op.physReg() < sgpr_limit))
+              err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
+                             "Operand %d has an out-of-bounds register assignment", i);
            if (op.physReg() == vcc && !program->needs_vcc)
-              err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
-           if (op.regClass().is_subdword() && !validate_subdword_operand(program->chip_class, instr, i))
+              err |= ra_fail(program, loc, Location(),
+                             "Operand %d fixed to vcc but needs_vcc=false", i);
+           if (op.regClass().is_subdword() &&
+               !validate_subdword_operand(program->chip_class, instr, i))
              err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
            if (!assignments[op.tempId()].firstloc.block)
              assignments[op.tempId()].firstloc = loc;
@@ -773,15 +865,23 @@ bool validate_ra(Program *program) {
            if (!def.isTemp())
              continue;
            if (!def.isFixed())
-              err |= ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
+              err |=
+                 ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
           if (assignments[def.tempId()].defloc.block)
-              err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
-           if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
-               (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
-              err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
+              err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc,
+                             "Temporary %%%d also defined by instruction", def.tempId());
+           if ((def.getTemp().type() == RegType::vgpr &&
+                def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
+               (def.getTemp().type() == RegType::sgpr &&
+                def.physReg() + def.size() > program->config->num_sgprs &&
+                def.physReg() < sgpr_limit))
+              err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc,
+                             "Definition %d has an out-of-bounds register assignment", i);
           if (def.physReg() == vcc && !program->needs_vcc)
-              err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
-           if (def.regClass().is_subdword() && !validate_subdword_definition(program->chip_class, instr))
+              err |= ra_fail(program, loc, Location(),
+                             "Definition %d fixed to vcc but needs_vcc=false", i);
+           if (def.regClass().is_subdword() &&
+               !validate_subdword_definition(program->chip_class, instr))
             err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
           if (!assignments[def.tempId()].firstloc.block)
             assignments[def.tempId()].firstloc = loc;
@@ -810,7 +910,9 @@ bool validate_ra(Program *program) {
          PhysReg reg = assignments.at(tmp.id()).reg;
          for (unsigned i = 0; i < tmp.bytes(); i++) {
             if (regs[reg.reg_b + i]) {
-               err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
+               err |= ra_fail(program, loc, Location(),
+                              "Assignment of element %d of %%%d already taken by %%%d in live-out",
+                              i, tmp.id(), regs[reg.reg_b + i]);
             }
             regs[reg.reg_b + i] = tmp.id();
          }
@@ -826,7 +928,10 @@ bool validate_ra(Program *program) {
          PhysReg reg = assignments.at(tmp.id()).reg;
          for (unsigned i = 0; i < tmp.bytes(); i++) {
             if (regs[reg.reg_b + i])
-               err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
+               err |= ra_fail(
+                  program, loc, Location(),
+                  "Assignment of element %d of %%%d already taken by %%%d in live-out", i,
+                  tmp.id(), regs[reg.reg_b + i]);
          }
          live.emplace(tmp);
       }
@@ -886,16 +991,23 @@ bool validate_ra(Program *program) {
            PhysReg reg = assignments.at(tmp.id()).reg;
            for (unsigned j = 0; j < tmp.bytes(); j++) {
               if (regs[reg.reg_b + j])
-                 err |= ra_fail(program, loc, assignments.at(regs[reg.reg_b + j]).defloc, "Assignment of element %d of %%%d already taken by %%%d from instruction", i, tmp.id(), regs[reg.reg_b + j]);
+                 err |= ra_fail(
+                    program, loc, assignments.at(regs[reg.reg_b + j]).defloc,
+                    "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
+                    tmp.id(), regs[reg.reg_b + j]);
               regs[reg.reg_b + j] = tmp.id();
            }
            if (def.regClass().is_subdword() && def.bytes() < 4) {
              unsigned written = get_subdword_bytes_written(program, instr, i);
-             /* If written=4, the instruction still might write the upper half. In that case, it's the lower half that isn't preserved */
+             /* If written=4, the instruction still might write the upper half. In that case, it's
+              * the lower half that isn't preserved */
              for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
                unsigned written_reg = reg.reg() * 4u + j;
                if (regs[written_reg] && regs[written_reg] != def.tempId())
-                  err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc, "Assignment of element %d of %%%d overwrites the full register taken by %%%d from instruction", i, tmp.id(), regs[written_reg]);
+                  err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc,
+                                 "Assignment of element %d of %%%d overwrites the full register "
+                                 "taken by %%%d from instruction",
+                                 i, tmp.id(), regs[written_reg]);
              }
            }
          }
@@ -924,4 +1036,4 @@ bool validate_ra(Program *program) {
    return err;
 }
 
-}
+} // namespace aco