mesa/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1145 lines
41 KiB
C++
Raw Normal View History

/*
* Copyright © 2018 Valve Corporation
* Copyright © 2018 Google
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_instruction_selection.h"
#include "aco_ir.h"
#include "aco_nir_call_attribs.h"
#include "util/memstream.h"
#include <optional>
namespace aco {
void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
const char* msg)
{
char* out;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &out, &outsize);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "%s: ", msg);
nir_print_instr(instr, memf);
u_memstream_close(&mem);
_aco_err(ctx->program, file, line, out);
free(out);
}
void
append_logical_start(Block* b)
{
Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}
void
append_logical_end(isel_context* ctx, bool append_reload_preserved)
{
Builder bld(ctx->program, ctx->block);
if (append_reload_preserved && ctx->program->is_callee && ctx->block->loop_nest_depth == 0)
emit_reload_preserved(ctx);
bld.pseudo(aco_opcode::p_logical_end);
}
Temp
get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
{
RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
Temp tmp = get_ssa_temp(ctx, def);
if (tmp.bytes() != rc.bytes())
return emit_extract_vector(ctx, tmp, 0, rc);
else
return tmp;
}
Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst)
{
Builder bld(ctx->program, ctx->block);
if (!dst.id())
dst = bld.tmp(bld.lm);
assert(val.regClass() == s1);
assert(dst.regClass() == bld.lm);
return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
bld.scc(val));
}
Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst)
{
Builder bld(ctx->program, ctx->block);
if (!dst.id())
dst = bld.tmp(s1);
assert(val.regClass() == bld.lm);
assert(dst.regClass() == s1);
/* if we're currently in WQM mode, ensure that the source is also computed in WQM */
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
return dst;
}
static Temp
as_vgpr(Builder& bld, Temp val)
{
if (val.type() == RegType::sgpr)
return bld.copy(bld.def(RegType::vgpr, val.size()), val);
assert(val.type() == RegType::vgpr);
return val;
}
Temp
as_vgpr(isel_context* ctx, Temp val)
{
Builder bld(ctx->program, ctx->block);
return as_vgpr(bld, val);
}
Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
/* no need to extract the whole vector */
if (src.regClass() == dst_rc) {
assert(idx == 0);
return src;
}
assert(src.bytes() > (idx * dst_rc.bytes()));
Builder bld(ctx->program, ctx->block);
auto it = ctx->allocated_vec.find(src.id());
if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
if (it->second[idx].regClass() == dst_rc) {
return it->second[idx];
} else {
assert(!dst_rc.is_subdword());
assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
return bld.copy(bld.def(dst_rc), it->second[idx]);
}
}
if (dst_rc.is_subdword())
src = as_vgpr(ctx, src);
if (src.bytes() == dst_rc.bytes()) {
assert(idx == 0);
return bld.copy(bld.def(dst_rc), src);
} else {
Temp dst = bld.tmp(dst_rc);
bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
return dst;
}
}
void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
if (num_components == 1)
return;
if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
return;
if (num_components > vec_src.size() && vec_src.type() == RegType::sgpr) {
/* sub-dword split: should still help get_alu_src() */
emit_split_vector(ctx, vec_src, vec_src.size());
return;
}
RegClass rc = RegClass::get(vec_src.type(), vec_src.bytes() / num_components);
aco_ptr<Instruction> split{
create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
split->operands[0] = Operand(vec_src);
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
for (unsigned i = 0; i < num_components; i++) {
elems[i] = ctx->program->allocateTmp(rc);
split->definitions[i] = Definition(elems[i]);
}
ctx->block->instructions.emplace_back(std::move(split));
ctx->allocated_vec.emplace(vec_src.id(), elems);
}
/* This vector expansion uses a mask to determine which elements in the new vector
* come from the original vector. The other elements are undefined. */
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
bool zero_padding)
{
assert(vec_src.type() == RegType::vgpr);
Builder bld(ctx->program, ctx->block);
if (dst.type() == RegType::sgpr && num_components > dst.size()) {
Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
return;
}
emit_split_vector(ctx, vec_src, util_bitcount(mask));
if (vec_src == dst)
return;
if (num_components == 1) {
if (dst.type() == RegType::sgpr)
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
else
bld.copy(Definition(dst), vec_src);
return;
}
unsigned component_bytes = dst.bytes() / num_components;
RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
Temp padding = Temp(0, dst_rc);
if (zero_padding)
padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
aco_ptr<Instruction> vec{
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
vec->definitions[0] = Definition(dst);
unsigned k = 0;
for (unsigned i = 0; i < num_components; i++) {
if (mask & (1 << i)) {
Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
if (dst.type() == RegType::sgpr)
src = bld.as_uniform(src);
vec->operands[i] = Operand(src);
elems[i] = src;
} else {
vec->operands[i] = Operand::zero(component_bytes);
elems[i] = padding;
}
}
ctx->block->instructions.emplace_back(std::move(vec));
ctx->allocated_vec.emplace(dst.id(), elems);
}
/**
* Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
* src_bits and dst_bits are truncated.
*
* Sign extension may be applied using the sign_extend parameter. The position of the input sign
* bit is indicated by src_bits in this case.
*
* If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
*/
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
bool sign_extend, Temp dst)
{
assert(!(sign_extend && dst_bits < src_bits) &&
"Shrinking integers is not supported for signed inputs");
if (!dst.id())
dst = bld.tmp(RegClass::get(src.type(), dst_bits / 8u));
assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
/* Copy the raw value, leaving an undefined value in the upper bits for
* the caller to handle appropriately */
return bld.copy(Definition(dst), src);
} else if (dst.bytes() < src.bytes()) {
return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
}
Temp tmp = dst;
if (dst_bits == 64)
tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
if (tmp == src) {
} else if (src.regClass() == s1) {
assert(src_bits < 32);
bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
} else {
assert(src_bits < 32);
bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
}
if (dst_bits == 64) {
if (sign_extend && dst.regClass() == s2) {
Temp high =
bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
} else if (sign_extend && dst.regClass() == v2) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
} else {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
}
}
return dst;
}
Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform)
{
if (ptr.size() == 2)
return ptr;
Builder bld(ctx->program, ctx->block);
if (ptr.type() == RegType::vgpr && !non_uniform)
ptr = bld.as_uniform(ptr);
return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
Operand::c32((unsigned)ctx->options->address32_hi));
}
void
select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
{
Builder bld(ctx->program, ctx->block);
Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
}
Operand
load_lds_size_m0(Builder& bld)
{
/* m0 does not need to be initialized on GFX9+ */
if (bld.program->gfx_level >= GFX9)
return Operand(s1);
return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
}
Temp
create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
unsigned elem_size_bytes, unsigned split_cnt, Temp dst)
{
Builder bld(ctx->program, ctx->block);
unsigned dword_size = elem_size_bytes / 4;
if (!dst.id())
dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
aco_ptr<Instruction> instr{
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
instr->definitions[0] = Definition(dst);
for (unsigned i = 0; i < cnt; ++i) {
if (arr[i].id()) {
assert(arr[i].size() == dword_size);
allocated_vec[i] = arr[i];
instr->operands[i] = Operand(arr[i]);
} else {
Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
Operand::zero(dword_size == 2 ? 8 : 4));
allocated_vec[i] = zero;
instr->operands[i] = Operand(zero);
}
}
bld.insert(std::move(instr));
if (split_cnt)
emit_split_vector(ctx, dst, split_cnt);
else
ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
return dst;
}
void
emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
Temp prim_mask, bool high_16bits)
{
Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
Builder bld(ctx->program, ctx->block);
if (ctx->cf_info.in_divergent_cf || ctx->cf_info.had_divergent_discard) {
bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
coord2, bld.m0(prim_mask));
return;
}
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
Temp res;
if (dst.regClass() == v2b) {
Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1,
p, high_16bits ? 0x5 : 0);
bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10,
high_16bits ? 0x1 : 0);
} else {
Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
}
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
set_wqm(ctx, true);
}
void
emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
Temp prim_mask, bool high_16bits)
{
if (ctx->options->gfx_level >= GFX11) {
emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask, high_16bits);
return;
}
Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
Builder bld(ctx->program, ctx->block);
if (dst.regClass() == v2b) {
if (ctx->program->dev.has_16bank_lds) {
assert(ctx->options->gfx_level <= GFX8);
Builder::Result interp_p1 =
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
bld.m0(prim_mask), idx, component);
interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v1), coord1,
bld.m0(prim_mask), interp_p1, idx, component, high_16bits);
bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
interp_p1, idx, component, high_16bits);
} else {
aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
if (ctx->options->gfx_level == GFX8)
interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
bld.m0(prim_mask), idx, component, high_16bits);
bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
component, high_16bits);
}
} else {
assert(!high_16bits);
Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
bld.m0(prim_mask), idx, component);
bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
idx, component);
}
}
void
emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
Temp dst, Temp prim_mask, bool high_16bits)
{
Builder bld(ctx->program, ctx->block);
Temp tmp = dst.bytes() == 2 ? bld.tmp(v1) : dst;
if (ctx->options->gfx_level >= GFX11) {
uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
if (ctx->cf_info.in_divergent_cf || ctx->cf_info.had_divergent_discard) {
bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
bld.m0(prim_mask));
} else {
Temp p =
bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(tmp), p, dpp_ctrl);
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
set_wqm(ctx, true);
}
} else {
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(tmp), Operand::c32((vertex_id + 2) % 3),
bld.m0(prim_mask), idx, component);
}
if (dst.id() != tmp.id())
bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::c32(high_16bits));
}
/* Packs multiple Temps of different sizes in to a vector of v1 Temps.
* The byte count of each input Temp must be a multiple of 2.
*/
std::vector<Temp>
emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
{
Builder bld(ctx->program, ctx->block);
std::vector<Temp> packed;
Temp low = Temp();
for (Temp tmp : unpacked) {
assert(tmp.bytes() % 2 == 0);
unsigned byte_idx = 0;
while (byte_idx < tmp.bytes()) {
if (low != Temp()) {
Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
low = Temp();
packed.push_back(dword);
byte_idx += 2;
} else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
byte_idx += 4;
} else {
low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
byte_idx += 2;
}
}
}
if (low != Temp()) {
Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
packed.push_back(dword);
}
return packed;
}
MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operand samp,
std::vector<Temp> coords, bool disable_wqm, Operand vdata)
{
bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
size_t nsa_size = bld.program->dev.max_nsa_vgprs;
if (!is_vsample && bld.program->gfx_level >= GFX12)
nsa_size++; /* VIMAGE can encode one more VADDR */
nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
if (strict_wqm)
nsa_size = coords.size();
for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
if (!coords[i].id())
continue;
coords[i] = as_vgpr(bld, coords[i]);
}
if (nsa_size < coords.size()) {
Temp coord = coords[nsa_size];
if (coords.size() - nsa_size > 1) {
aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
coords.size() - nsa_size, 1)};
unsigned coord_size = 0;
for (unsigned i = nsa_size; i < coords.size(); i++) {
vec->operands[i - nsa_size] = Operand(coords[i]);
coord_size += coords[i].size();
}
coord = bld.tmp(RegType::vgpr, coord_size);
vec->definitions[0] = Definition(coord);
bld.insert(std::move(vec));
} else {
coord = as_vgpr(bld, coord);
}
coords[nsa_size] = coord;
coords.resize(nsa_size + 1);
}
aco_ptr<Instruction> mimg{
create_instruction(op, Format::MIMG, 3 + coords.size() + disable_wqm * 2, dsts.size())};
for (unsigned i = 0; i < dsts.size(); ++i)
mimg->definitions[i] = Definition(dsts[i]);
mimg->operands[0] = Operand(rsrc);
mimg->operands[1] = samp;
mimg->operands[2] = vdata;
for (unsigned i = 0; i < coords.size(); i++)
mimg->operands[3 + i] = Operand(coords[i]);
init_disable_wqm(bld, mimg->mimg(), disable_wqm);
mimg->mimg().strict_wqm = strict_wqm;
return &bld.insert(std::move(mimg))->mimg();
}
Operand
emit_tfe_init(Builder& bld, Temp dst)
{
Temp tmp = bld.tmp(dst.regClass());
aco_ptr<Instruction> vec{
create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
for (unsigned i = 0; i < dst.size(); i++)
vec->operands[i] = Operand::zero();
vec->definitions[0] = Definition(tmp);
/* Since this is fixed to an instruction's definition register, any CSE will
* just create copies. Copying costs about the same as zero-initialization,
* but these copies can break up clauses.
*/
vec->definitions[0].setNoCSE(true);
bld.insert(std::move(vec));
return Operand(tmp);
}
void
create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
const struct aco_export_mrt* mrt1)
{
Builder bld(ctx->program, ctx->block);
aco_ptr<Instruction> exp{
create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 10, 6)};
for (unsigned i = 0; i < 4; i++) {
exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
}
instr_exact_mask(exp.get()) = Operand();
instr_wqm_mask(exp.get()) = Operand();
RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
exp->definitions[0] = bld.def(type); /* mrt0 */
exp->definitions[1] = bld.def(type); /* mrt1 */
exp->definitions[2] = bld.def(bld.lm);
exp->definitions[3] = bld.def(bld.lm);
exp->definitions[4] = bld.def(bld.lm, vcc);
exp->definitions[5] = bld.def(s1, scc);
ctx->block->instructions.emplace_back(std::move(exp));
ctx->program->has_color_exports = true;
}
Temp
lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
{
assert(count.regClass() == s1);
Builder bld(ctx->program, ctx->block);
/* We could optimize other cases, but they are unused at the moment. */
if (bit_offset != 0 && bit_offset != 8) {
assert(bit_offset < 32);
count = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), count,
Operand::c32(bit_offset));
bit_offset = 0;
}
if (ctx->program->wave_size == 32 && bit_offset == 0) {
/* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
* the register. It doesn't work for 64 because it only uses 6 bits. */
Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
return emit_extract_vector(ctx, mask, 0, bld.lm);
} else {
/* s_bfe (both u32 and u64) uses 7 bits for the size, but it needs them in the high word.
* The low word is used for the offset, which has to be zero for our use case.
*/
if (bit_offset == 0 && ctx->program->gfx_level >= GFX9) {
/* Avoid writing scc for better scheduling. */
count = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand::c32(0), count);
} else {
count = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), count,
Operand::c32(16 - bit_offset));
}
if (ctx->program->wave_size == 32) {
return bld.sop2(aco_opcode::s_bfe_u32, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(-1),
count);
} else {
return bld.sop2(aco_opcode::s_bfe_u64, bld.def(bld.lm), bld.def(s1, scc),
Operand::c64(-1ll), count);
}
}
}
void
build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
{
aco_ptr<Instruction> end{
create_instruction(aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
for (unsigned i = 0; i < regs.size(); i++)
end->operands[i] = regs[i];
ctx->block->instructions.emplace_back(std::move(end));
ctx->block->kind |= block_kind_end_with_regs;
}
Instruction*
add_startpgm(struct isel_context* ctx, bool is_callee)
{
ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
unsigned def_count = 0;
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
if (ctx->args->args[i].skip)
continue;
unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
def_count += ctx->args->args[i].size;
else
def_count++;
}
if (is_callee) {
/* We do not support shader args in callees. */
assert(def_count == 0);
def_count += ctx->callee_info.reg_param_count;
/* Add system parameters separately - they aren't counted by reg_param_count */
assert(ctx->callee_info.stack_ptr.is_reg && ctx->callee_info.return_address.is_reg);
def_count += 2;
}
Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
ctx->block->instructions.emplace_back(startpgm);
for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
if (ctx->args->args[i].skip)
continue;
enum ac_arg_regfile file = ctx->args->args[i].file;
unsigned size = ctx->args->args[i].size;
unsigned reg = ctx->args->args[i].offset;
RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
Temp elems[16];
for (unsigned j = 0; j < size; j++) {
elems[j] = ctx->program->allocateTmp(s1);
startpgm->definitions[arg++] = Definition(elems[j], PhysReg{reg + j});
}
ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
} else {
Temp dst = ctx->program->allocateTmp(type);
Definition def(dst);
def.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
ctx->arg_temps[i] = dst;
startpgm->definitions[arg++] = def;
if (ctx->args->args[i].pending_vmem) {
assert(file == AC_ARG_VGPR);
ctx->program->args_pending_vmem.push_back(def);
}
}
}
if (is_callee) {
unsigned def_idx = 0;
if (ctx->program->gfx_level >= GFX9)
ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
else
ctx->program->static_scratch_rsrc = ctx->callee_info.stack_ptr.def.getTemp();
startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def;
startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def;
for (auto& info : ctx->callee_info.param_infos) {
if (!info.is_reg)
continue;
startpgm->definitions[def_idx++] = info.def;
}
}
/* epilog has no scratch */
if (ctx->args->scratch_offset.used) {
if (ctx->program->gfx_level < GFX9) {
/* Stash these in the program so that they can be accessed later when
* handling spilling.
*/
if (ctx->args->ring_offsets.used)
ctx->program->private_segment_buffers.push_back(get_arg(ctx, ctx->args->ring_offsets));
ctx->program->scratch_offsets.push_back(get_arg(ctx, ctx->args->scratch_offset));
} else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
/* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
*/
Operand scratch_addr = ctx->args->ring_offsets.used
? Operand(get_arg(ctx, ctx->args->ring_offsets))
: Operand(s2);
Builder bld(ctx->program, ctx->block);
bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
get_arg(ctx, ctx->args->scratch_offset));
}
}
return startpgm;
}
static void
cleanup_cfg(Program* program)
{
/* create linear_succs/logical_succs */
for (Block& BB : program->blocks) {
for (unsigned idx : BB.linear_preds)
program->blocks[idx].linear_succs.emplace_back(BB.index);
for (unsigned idx : BB.logical_preds)
program->blocks[idx].logical_succs.emplace_back(BB.index);
}
}
void
finish_program(isel_context* ctx)
{
cleanup_cfg(ctx->program);
/* Insert a single p_end_wqm instruction after the last derivative calculation */
if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
/* Find the next BB at top-level CFG */
while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
ctx->wqm_block_idx++;
ctx->wqm_instruction_idx = 0;
}
std::vector<aco_ptr<Instruction>>* instrs =
&ctx->program->blocks[ctx->wqm_block_idx].instructions;
auto it = instrs->begin() + ctx->wqm_instruction_idx;
/* Delay transistion to Exact to help optimizations and scheduling */
while (it != instrs->end()) {
aco_ptr<Instruction>& instr = *it;
/* End WQM before: */
if (instr->isDS() || instr->isEXP() ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
instr->opcode == aco_opcode::p_jump_to_epilog ||
instr->opcode == aco_opcode::p_logical_start)
break;
++it;
/* End WQM after: */
if (instr->opcode == aco_opcode::p_logical_end ||
instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper ||
instr->opcode == aco_opcode::p_end_with_regs)
break;
}
Builder bld(ctx->program);
bld.reset(instrs, it);
bld.pseudo(aco_opcode::p_end_wqm);
}
}
ABI
nir_abi_to_aco(unsigned function_attributes)
{
switch (function_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
case ACO_NIR_CALL_ABI_RT_RECURSIVE: return rtRaygenABI;
case ACO_NIR_CALL_ABI_TRAVERSAL: return rtTraversalABI;
case ACO_NIR_CALL_ABI_AHIT_ISEC: return rtAnyHitABI;
default: UNREACHABLE("invalid abi");
}
}
struct param_assignment_info {
uint16_t required_alignment;
uint16_t provided_alignment;
RegClass rc;
parameter_info* dst_info;
bool is_return_param;
/* If true, this parameter shouldn't count toward the callee info's reg_param_count because it
* receives special handling (e.g. the call return address being a definition instead of an
* operand).
*/
bool is_system_param;
/* This parameter must reside in a register. Used for stack pointers as well as s_swappc
* operands.
*/
bool force_reg;
};
std::optional<PhysReg>
find_reg(BITSET_WORD* regs, RegClass rc)
{
uint16_t start = 0;
uint16_t size = 128;
if (rc.type() == RegType::vgpr) {
start = 256;
size = 256;
}
uint16_t contiguous_size = 0;
for (uint16_t i = 0; i < size; ++i) {
if (!BITSET_TEST(regs, start + i)) {
contiguous_size = 0;
continue;
}
if (++contiguous_size >= rc.size())
return PhysReg{(unsigned)(start + i - contiguous_size + 1)};
}
return {};
}
void
find_param_regs(Program* program, const ABI& abi, callee_info& info,
std::vector<struct param_assignment_info>& params, RegisterDemand reg_limit)
{
unsigned scratch_param_bytes = 0;
RegisterDemand param_demand = RegisterDemand();
BITSET_DECLARE(preserved_regs, 512);
BITSET_DECLARE(clobbered_regs, 512);
abi.preservedRegisters(preserved_regs, reg_limit);
BITSET_COPY(clobbered_regs, preserved_regs);
BITSET_NOT(clobbered_regs);
bool has_preserved_regs = !BITSET_IS_EMPTY(preserved_regs);
std::stable_sort(params.begin(), params.end(),
[](const param_assignment_info& first, const param_assignment_info& second)
{
/* Assign parameters with larger alignments first so we can use parameters
* with smaller alignments as padding
*/
return first.provided_alignment > second.provided_alignment;
});
std::stable_sort(params.begin(), params.end(),
[](const param_assignment_info& first, const param_assignment_info& second)
{
/* Move parameters forced into registers to the very front so we assign
* them first.
*/
return first.force_reg && !second.force_reg;
});
for (size_t i = 1; i < params.size(); ++i) {
assert(!params[i].force_reg || params[i - 1].force_reg);
}
/* Reverse parameters and start from the end, to make erasing elements cheap */
std::reverse(params.begin(), params.end());
while (!params.empty()) {
RegClass rc = params.back().rc;
bool discardable = params.back().dst_info->discardable || params.back().is_return_param;
BITSET_WORD* regs;
if (has_preserved_regs && !discardable)
regs = preserved_regs;
else
regs = clobbered_regs;
auto next_reg = find_reg(regs, rc);
/* Force parameter into scratch if it exceeds the ABI's maximum parameter demand */
if (abi.max_param_demand != RegisterDemand() &&
(param_demand + Temp(0, rc)).exceeds(abi.max_param_demand))
next_reg = {};
if (next_reg && next_reg->reg() % params.back().required_alignment) {
/* We found a register, but it's not aligned properly. Check if we can add some padding
* (and ideally stuff a different parameter in there).
*/
uint16_t required_padding =
params.back().required_alignment - (next_reg->reg() % params.back().required_alignment);
uint16_t aligned_size = rc.size() + required_padding;
for (unsigned i = 0; i < aligned_size; ++i) {
/* The added padding exceeds the size of the register range. Just bail out at this
* point.
* TODO: we could probably try finding a new register, but then we'd need to reevaluate
* alignment etc...
*/
if (!BITSET_TEST(regs, next_reg->advance(i * 4).reg())) {
next_reg = {};
break;
}
}
/* Try finding a small parameter to put inside the padding space */
for (auto it2 = std::next(params.rbegin()); next_reg && it2 != params.rend(); ++it2) {
if (it2->rc.type() != params.back().rc.type() ||
it2->dst_info->discardable != discardable)
continue;
if (it2->rc.size() > required_padding || (it2->required_alignment % next_reg->reg()))
continue;
param_demand += Temp(0, it2->rc);
it2->dst_info->needs_explicit_preservation =
regs == clobbered_regs && !it2->dst_info->discardable;
it2->dst_info->def.setPrecolored(*next_reg);
for (unsigned i = 0; i < it2->rc.size(); ++i)
BITSET_CLEAR(regs, next_reg->reg() + i);
if (!it2->is_system_param) {
++info.reg_param_count;
if (discardable)
++info.reg_discardable_param_count;
}
params.erase(std::prev(it2.base()));
break;
}
if (next_reg)
next_reg = next_reg->advance(required_padding * 4);
}
if (next_reg) {
params.back().dst_info->needs_explicit_preservation =
regs == clobbered_regs && !params.back().dst_info->discardable;
param_demand += Temp(0, params.back().rc);
params.back().dst_info->def.setPrecolored(*next_reg);
BITSET_CLEAR_COUNT(regs, next_reg->reg(), params.back().rc.size());
if (!params.back().is_system_param) {
++info.reg_param_count;
if (discardable)
++info.reg_discardable_param_count;
}
} else {
assert(!params.back().force_reg);
params.back().dst_info->is_reg = false;
params.back().dst_info->scratch_offset = scratch_param_bytes;
scratch_param_bytes += rc.size() * 4;
}
params.pop_back();
}
info.scratch_param_size = scratch_param_bytes;
if (program)
program->callee_param_demand = param_demand;
}
struct callee_info
get_callee_info(amd_gfx_level gfx_level, unsigned wave_size, const ABI& abi, unsigned param_count,
const nir_parameter* parameters, Program* program, RegisterDemand reg_limit)
{
struct callee_info info = {};
info.param_infos.reserve(param_count);
std::vector<param_assignment_info> assignment_infos;
assignment_infos.reserve(param_count + 2);
Temp return_addr = program ? program->allocateTmp(s2) : Temp();
Definition return_def = Definition(return_addr);
info.return_address = {};
info.return_address.discardable = false;
info.return_address.is_reg = true;
info.return_address.def = return_def;
param_assignment_info return_def_info = {};
return_def_info.required_alignment = 2;
return_def_info.provided_alignment = 2;
return_def_info.rc = s2;
return_def_info.dst_info = &info.return_address;
return_def_info.is_return_param = false;
return_def_info.is_system_param = true;
return_def_info.force_reg = true;
assignment_infos.push_back(return_def_info);
if (gfx_level >= GFX9) {
Temp stack_ptr = program ? program->allocateTmp(s1) : Temp();
Definition stack_def = Definition(stack_ptr);
info.stack_ptr = {};
info.stack_ptr.discardable = false;
info.stack_ptr.is_reg = true;
info.stack_ptr.def = stack_def;
param_assignment_info stack_ptr_info = {};
stack_ptr_info.required_alignment = 1;
stack_ptr_info.provided_alignment = 1;
stack_ptr_info.rc = s1;
stack_ptr_info.dst_info = &info.stack_ptr;
stack_ptr_info.is_return_param = false;
stack_ptr_info.is_system_param = true;
stack_ptr_info.force_reg = true;
assignment_infos.push_back(stack_ptr_info);
} else {
Temp scratch_rsrc = program ? program->allocateTmp(s4) : Temp();
Definition rsrc_def = Definition(scratch_rsrc);
info.stack_ptr = {};
info.stack_ptr.discardable = false;
info.stack_ptr.is_reg = true;
info.stack_ptr.def = rsrc_def;
param_assignment_info rsrc_info = {};
rsrc_info.required_alignment = 4;
rsrc_info.provided_alignment = 4;
rsrc_info.rc = s4;
rsrc_info.dst_info = &info.stack_ptr;
rsrc_info.is_return_param = false;
rsrc_info.is_system_param = true;
rsrc_info.force_reg = true;
assignment_infos.push_back(rsrc_info);
}
size_t info_base = assignment_infos.size();
for (unsigned i = 0; i < param_count; ++i) {
RegType type = parameters[i].is_uniform ? RegType::sgpr : RegType::vgpr;
unsigned byte_size = align(parameters[i].bit_size, 32) / 8 * parameters[i].num_components;
if (parameters[i].bit_size == 1) {
type = RegType::sgpr;
byte_size = wave_size / 8;
}
RegClass rc = RegClass(type, byte_size / 4);
Temp dst = program ? program->allocateTmp(rc) : Temp();
Definition def = Definition(dst);
parameter_info param_info = {};
param_info.discardable =
!!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
param_info.is_reg = true;
param_info.def = def;
info.param_infos.push_back(param_info);
uint16_t required_alignment = 1;
uint16_t provided_alignment = 1;
if (rc.type() == RegType::sgpr) {
if (rc.size() > 2)
required_alignment = 4;
else if (rc.size() > 1)
required_alignment = 2;
}
if (rc.size() % 4 == 0)
provided_alignment = 4;
else if (rc.size() % 2 == 0)
provided_alignment = 2;
param_assignment_info assignment_info = {};
assignment_info.required_alignment = required_alignment;
assignment_info.provided_alignment = provided_alignment;
assignment_info.rc = rc;
assignment_info.is_return_param = parameters[i].is_return;
/* Force the first two parameters (callee addresses) into registers - they're assumed to be
* accessible through a temp.
*/
assignment_info.force_reg = i <= 1;
assignment_infos.push_back(assignment_info);
}
for (unsigned i = 0; i < param_count; ++i)
assignment_infos[info_base + i].dst_info = &info.param_infos[i];
find_param_regs(program, abi, info, assignment_infos, reg_limit);
/* The call target parameters are special - they are marked as discardable to allow us
* to overwrite the parameter values within each callee for the divergent dispatch logic.
* However, we still need to explicitly write back the new values to the ABI-assigned registers
* when jumping to the next divergent callee/returning. Therefore, mark them as needing explicit
* preservation.
*/
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_DIVERGENT_PC].needs_explicit_preservation = true;
info.param_infos[ACO_NIR_CALL_SYSTEM_ARG_UNIFORM_PC].needs_explicit_preservation = true;
/* Explicitly preserve the stack pointer. spill_preserved() can ensure correctness on its own,
* but it only can spill the initial stack pointer value to a linear VGPR, the inactive lanes of
* which would in turn need to be spilled to scratch. Explicitly preserving the stack pointer's
* value is more efficient.
*/
info.stack_ptr.needs_explicit_preservation = true;
return info;
}
void
emit_reload_preserved(isel_context* ctx)
{
Builder bld(ctx->program, ctx->block);
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->program->stack_ptr);
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(bld.lm), Operand(), stack_ptr_op);
}
} // namespace aco