aco: implement aco_compile_vs_prolog

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>
This commit is contained in:
Rhys Perry 2021-05-17 17:53:30 +01:00 committed by Marge Bot
parent f6f6f18e55
commit f4ea2d7887
3 changed files with 362 additions and 1 deletions

View file

@ -11820,4 +11820,325 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
cleanup_cfg(program);
}
/* Builds an Operand fixed to the physical register that holds the given shader
 * argument. Asserts the argument is actually in use; the register file (SGPR vs
 * VGPR), size and offset come from the ac_shader_args description.
 */
Operand
get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
{
   assert(arg.used);

   const auto& info = args->ac.args[arg.arg_index];
   const bool is_sgpr = info.file == AC_ARG_SGPR;

   /* VGPRs live at index 256+ in ACO's flat physical register space. */
   PhysReg reg(is_sgpr ? info.offset : info.offset + 256u);
   RegClass rc(is_sgpr ? RegType::sgpr : RegType::vgpr, info.size);
   return Operand(reg, rc);
}
/* Emits SMEM loads that fetch up to `max` vertex-buffer descriptors (4 dwords
 * each) starting at descriptor index `start` from the table pointed to by
 * `base`, into consecutive SGPRs beginning at `dest`. The count is clamped so
 * the loads stay within the SGPR limit. Returns the number of descriptors
 * actually loaded.
 */
unsigned
load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
{
   /* Clamp to how many 4-SGPR descriptors fit in the remaining SGPR space. */
   const unsigned avail = (bld.program->dev.sgpr_limit - dest.reg()) / 4u;
   const unsigned count = MIN2(avail, max);

   /* Descriptors are fetched in power-of-two groups: one x16 load per group of
    * four, then an x8 and/or x4 load for the remainder. */
   const unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
   if (bld.program->chip_class >= GFX10 && num_loads > 1)
      bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);

   unsigned i = 0;
   while (i < count) {
      const unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
      const Operand soffset = Operand::c32((start + i) * 16u);

      switch (size) {
      case 4:
         bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base, soffset);
         break;
      case 2:
         bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base, soffset);
         break;
      default:
         bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base, soffset);
         break;
      }

      dest = dest.advance(size * 16u);
      i += size;
   }

   return count;
}
/* Computes the instance index for an attribute with a non-trivial (not 0/1)
 * instance-rate divisor: roughly instance_id / divisor + start_instance, using
 * precomputed constants loaded from the prolog input pointer (shift/add/mul-hi
 * /shift sequence — presumably a fast-division magic-number scheme; the exact
 * packing is produced by the caller that writes prolog_inputs).
 *
 * `index` selects which divisor's constants to load. tmp_sgpr gets the 64-bit
 * constant pair; tmp_vgpr0 holds the result; tmp_vgpr1 is scratch on pre-GFX9.
 * Returns an Operand referencing the result VGPR (tmp_vgpr0).
 */
Operand
calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
                            Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
                            PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
{
   /* Constants for the index-th nontrivial divisor live at byte offset 8+index*8. */
   bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
            get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));

   /* Wait for the SMEM load before using the constants. */
   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;
   bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));

   Definition fetch_index_def(tmp_vgpr0, v1);
   Operand fetch_index(tmp_vgpr0, v1);

   /* First dword packs shift amounts/adjustments in separate bytes; the second
    * dword (tmp_sgpr+4) is the multiplier for v_mul_hi_u32. */
   Operand div_info(tmp_sgpr, s1);
   if (bld.program->chip_class >= GFX8) {
      /* use SDWA to select individual bytes of div_info */
      if (bld.program->chip_class < GFX9) {
         /* GFX8 SDWA can't read an SGPR source directly; copy to a VGPR first. */
         bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
         div_info = Operand(tmp_vgpr1, v1);
      }

      /* NOTE: previously had a dead trailing `.instr` member access here. */
      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);

      Instruction* instr;
      if (bld.program->chip_class >= GFX9)
         instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
      else
         instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
                               div_info, fetch_index)
                    .instr;
      /* Add only uses byte 1 of div_info. */
      instr->sdwa().sel[0] = SubdwordSel::ubyte1;

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
               fetch_index);

      instr =
         bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
      /* Final shift amount comes from byte 2 of div_info. */
      instr->sdwa().sel[0] = SubdwordSel::ubyte2;
   } else {
      /* Pre-GFX8: no SDWA, extract the bytes with v_bfe_u32 instead. */
      Operand tmp_op(tmp_vgpr1, v1);
      Definition tmp_def(tmp_vgpr1, v1);

      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
      bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
               Operand(tmp_sgpr.advance(4), s1));

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
   }

   bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);

   return fetch_index;
}
/* Builds the IR for a standalone vertex-shader prolog: loads the vertex-buffer
 * descriptors, computes per-attribute fetch indices (vertex or instance rate,
 * with optional divisors), loads all attributes into the VGPRs where the main
 * shader expects them, applies alpha adjustment for 2_10_10_10 formats, then
 * jumps (s_setpc_b64) to the main shader. Register allocation is done by hand
 * here; no later RA pass runs on this program.
 *
 * num_preserved_sgprs returns how many leading SGPRs the prolog leaves intact
 * for the main shader.
 */
void
select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
                 const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
{
   assert(key->num_attributes > 0);

   /* This should be enough for any shader/stage. */
   unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
   *num_preserved_sgprs = max_user_sgprs + 14;

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   Block* block = program->create_and_insert_block();
   block->kind = block_kind_top_level;

   program->workgroup_size = 64;
   calc_min_waves(program);

   Builder bld(program, block);

   /* Rough upper bound on instruction count to avoid reallocation. */
   block->instructions.reserve(16 + key->num_attributes * 4);

   bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);

   uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
   bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;

   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;

   /* choose sgprs */
   PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
   PhysReg prolog_input = vertex_buffers.advance(8);
   /* Descriptor scratch goes after the inputs we keep; aligned to 4 for SMEM. */
   PhysReg desc(
      align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));

   Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
   Operand instance_id = get_arg_fixed(args, args->ac.instance_id);

   PhysReg attributes_start(256 + args->ac.num_vgprs_used);
   /* choose vgprs that won't be used for anything else until the last attribute load */
   PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
   PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
   PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
   PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
   PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);

   /* Assemble the 64-bit vertex-buffer table pointer: low dword from the
    * argument, high dword from the known 32-bit address base. */
   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
            get_arg_fixed(args, args->ac.vertex_buffers));
   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
            Operand::c32((unsigned)args->options->address32_hi));

   /* calculate vgpr requirements */
   unsigned num_vgprs = attributes_start.reg() - 256;
   num_vgprs += key->num_attributes * 4;
   if (has_nontrivial_divisors && program->chip_class <= GFX8)
      num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
   unsigned num_sgprs = 0;

   /* Outer loop: load as many descriptors as fit into SGPRs, then fetch the
    * corresponding attributes; repeat until all attributes are loaded. */
   for (unsigned loc = 0; loc < key->num_attributes;) {
      unsigned num_descs =
         load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
      num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());

      if (loc == 0) {
         /* perform setup while we load the descriptors */
         if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
            /* Merged shader: set exec from the lane count in merged_wave_info. */
            Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
            if (program->wave_size == 64) {
               /* s_bfm_b64 with count==64 yields 0; select all-ones instead. */
               bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
                        Operand::c32(6u /* log2(64) */));
               bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
                        Operand(exec, s2), Operand(scc, s1));
            }
         }

         /* Divisor 1 needs start_instance+instance_id; divisor 0 needs just
          * start_instance; anything else is handled per-attribute later. */
         bool needs_instance_index = false;
         bool needs_start_instance = false;
         u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
         {
            needs_instance_index |= key->state->divisors[i] == 1;
            needs_start_instance |= key->state->divisors[i] == 0;
         }
         bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;

         if (needs_vertex_index)
            bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
                       get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
         if (needs_instance_index)
            bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
                       Operand(s2), true);
         if (needs_start_instance)
            bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
      }

      /* Wait for the descriptor loads before using them. */
      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));

      for (unsigned i = 0; i < num_descs; i++, loc++) {
         PhysReg dest(attributes_start.reg() + loc * 4u);

         /* calculate index */
         Operand fetch_index = Operand(vertex_index, v1);
         if (key->state->instance_rate_inputs & (1u << loc)) {
            uint32_t divisor = key->state->divisors[loc];
            if (divisor) {
               fetch_index = instance_id;
               if (key->state->nontrivial_divisors & (1u << loc)) {
                  /* index = how many nontrivial divisors precede this one. */
                  unsigned index =
                     util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
                  fetch_index = calc_nontrivial_instance_id(
                     bld, args, index, instance_id, start_instance, prolog_input,
                     nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
               } else {
                  fetch_index = Operand(instance_index, v1);
               }
            } else {
               fetch_index = Operand(start_instance_vgpr, v1);
            }
         }

         /* perform load */
         PhysReg cur_desc = desc.advance(i * 16);
         if ((key->misaligned_mask & (1u << loc))) {
            /* Misaligned attribute: load channel by channel. */
            unsigned dfmt = key->state->formats[loc] & 0xf;
            unsigned nfmt = key->state->formats[loc] >> 4;
            const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
            for (unsigned j = 0; j < vtx_info->num_channels; j++) {
               bool post_shuffle = key->state->post_shuffle & (1u << loc);
               /* post_shuffle swaps channels 0 and 2 (BGRA-style). */
               unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
               /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec
                * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
                * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
                * care).
                */
               if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
                  bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
                            false, true);
               else
                  bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
                            vtx_info->chan_format, nfmt, offset, false, true);
            }
            /* Fill missing channels with the format defaults (0,0,0,1). */
            uint32_t one =
               nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
                  ? 1u
                  : 0x3f800000u;
            for (unsigned j = vtx_info->num_channels; j < 4; j++) {
               bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
                        Operand::c32(j == 3 ? one : 0u));
            }
         } else {
            bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
         }
      }
   }

   /* Alpha adjustment reads the loaded values, so wait for the buffer loads. */
   if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
      wait_imm vm_imm;
      vm_imm.vm = 0;
      bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
   }

   /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
    * so we may need to fix it up. */
   u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
   {
      PhysReg alpha(attributes_start.reg() + loc * 4u + 3);

      /* 2-bit adjust mode split across the lo/hi masks. */
      unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
      alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;

      if (alpha_adjust == ALPHA_ADJUST_SSCALED)
         bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
      bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
               Operand::c32(offset), Operand::c32(2u));

      /* Convert back to the right type. */
      if (alpha_adjust == ALPHA_ADJUST_SNORM) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
         /* Clamp -2 (0b10) to -1.0f. */
         bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
                  Operand(alpha, v1));
      } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
      }
   }

   block->kind |= block_kind_uniform;

   /* continue on to the main shader */
   Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
   if (has_nontrivial_divisors) {
      /* The prolog-input SGPRs were clobbered as scratch above; reload the
       * continue PC from the first qword of the prolog inputs. */
      bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
               get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
      continue_pc = Operand(prolog_input, s2);
   }

   bld.sop1(aco_opcode::s_setpc_b64, continue_pc);

   program->config->float_mode = program->blocks[0].fp_mode.val;
   /* addition on GFX6-8 requires a carry-out (we use VCC) */
   program->needs_vcc = program->chip_class <= GFX8;
   program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
   program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
} // namespace aco

View file

@ -268,5 +268,42 @@ void
aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
                      const struct radv_shader_args* args)
{
   /* Compile a standalone VS prolog: select IR, insert hazard NOPs, assemble,
    * and package the machine code plus register counts into *binary (allocated
    * with calloc; the caller owns and frees it).
    *
    * Note: a leftover `unreachable("TODO")` stub preceded the implementation
    * and would have aborted before compiling anything; it has been removed. */
   aco::init();

   /* create program */
   ac_shader_config config = {0};
   std::unique_ptr<aco::Program> program{new aco::Program};
   program->collect_statistics = false;
   program->debug.func = NULL;
   program->debug.private_data = NULL;

   /* create IR */
   unsigned num_preserved_sgprs;
   aco::select_vs_prolog(program.get(), key, &config, args, &num_preserved_sgprs);
   aco::insert_NOPs(program.get());

   if (args->options->dump_shader)
      aco_print_program(program.get(), stderr);

   /* assembly */
   std::vector<uint32_t> code;
   /* Rough size estimate; emit_program appends as needed. */
   code.reserve(align(program->blocks[0].instructions.size() * 2, 16));
   unsigned exec_size = aco::emit_program(program.get(), code);

   if (args->options->dump_shader) {
      aco::print_asm(program.get(), code, exec_size / 4u, stderr);
      fprintf(stderr, "\n");
   }

   /* copy into binary */
   size_t size = code.size() * sizeof(uint32_t) + sizeof(radv_prolog_binary);
   radv_prolog_binary* prolog_binary = (radv_prolog_binary*)calloc(size, 1);

   prolog_binary->num_sgprs = config.num_sgprs;
   prolog_binary->num_vgprs = config.num_vgprs;
   prolog_binary->num_preserved_sgprs = num_preserved_sgprs;
   prolog_binary->code_size = code.size() * sizeof(uint32_t);
   memcpy(prolog_binary->data, code.data(), prolog_binary->code_size);

   *binary = prolog_binary;
}

View file

@ -2146,6 +2146,9 @@ void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_sh
const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
ac_shader_config* config, const struct radv_shader_args* args);
/* Builds the IR for a standalone vertex-shader prolog that loads vertex
 * attributes into VGPRs and jumps to the main shader; num_preserved_sgprs
 * returns how many leading SGPRs the prolog leaves untouched. */
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config, const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);
void lower_phis(Program* program);
void calc_min_waves(Program* program);