aco: don't reuse misaligned attribute destination VGPRs in VS prologs

Since we split misaligned attributes, we could overwrite one of these
VGPRs in the middle of loading the attribute.

For example:
   v_add_u32_e32 v4, vcc, s7, v1
   s_waitcnt lgkmcnt(0)
   buffer_load_dword v4, v4, s[32:35], 0 idxen
   buffer_load_dword v5, v4, s[32:35], 0 idxen offset:4
can overwrite the vertex index (v4) when the first component is loaded, even
though the load of the second component still needs it as its index.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27920>
This commit is contained in:
Rhys Perry 2024-02-28 12:23:17 +00:00 committed by Marge Bot
parent df7024bcdd
commit ec892c4d2b

View file

@ -12767,6 +12767,20 @@ select_rt_prolog(Program* program, ac_shader_config* config,
program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
/**
 * Hand out `size` consecutive VGPRs and return the first one as a PhysReg.
 *
 * @param size    Number of consecutive registers requested.
 * @param num     In/out: running count of VGPRs in use. It is raised to the
 *                end of the returned range whenever that range reaches or
 *                goes past the current count.
 * @param offset  Optional in/out displacement (may be negative) added to
 *                `*num` to pick the first register. A negative value places
 *                the range inside registers that are already counted.
 *                While the range ends below `*num`, the displacement is
 *                advanced by `size` so the next call continues right after
 *                it. Once the range touches or exceeds `*num`, the
 *                displacement is reset to 0, so later calls simply append.
 *
 * @return PhysReg built from `256 + index` of the first register
 *         (presumably 256 is the VGPR base of the PhysReg numbering --
 *         TODO confirm against the PhysReg definition).
 */
PhysReg
get_next_vgpr(unsigned size, unsigned* num, int *offset = NULL)
{
   const int displacement = offset ? *offset : 0;
   const unsigned first = *num + displacement;
   const unsigned end = first + size;

   if (end < *num) {
      /* Range lies entirely within the already counted registers:
       * keep the count and step the displacement past this range. */
      if (offset)
         *offset += size;
      return PhysReg(256 + first);
   }

   /* Range reaches (or extends beyond) the current count: grow the count
    * and stop displacing subsequent allocations. */
   *num = end;
   if (offset)
      *offset = 0;
   return PhysReg(256 + first);
}
void
select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
const struct aco_compiler_options* options, const struct aco_shader_info* info,
@ -12808,13 +12822,30 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
Operand start_instance = get_arg_fixed(args, args->start_instance);
Operand instance_id = get_arg_fixed(args, args->instance_id);
PhysReg attributes_start(256 + args->num_vgprs_used);
/* choose vgprs that won't be used for anything else until the last attribute load */
PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
bool needs_instance_index =
pinfo->instance_rate_inputs &
~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
bool needs_tmp_vgpr0 = has_nontrivial_divisors;
bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
(program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
unsigned num_vgprs = args->num_vgprs_used;
PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
if (needs_vertex_index)
vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
if (needs_instance_index)
instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
if (needs_start_instance)
start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
if (needs_tmp_vgpr0)
nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
if (needs_tmp_vgpr1)
nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
get_arg_fixed(args, args->vertex_buffers));
@ -12826,16 +12857,10 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
Operand::c32((unsigned)options->address32_hi));
}
/* calculate vgpr requirements */
unsigned num_vgprs = attributes_start.reg() - 256;
num_vgprs += pinfo->num_attributes * 4;
if (has_nontrivial_divisors && program->gfx_level <= GFX8)
num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
unsigned num_sgprs = 0;
const struct ac_vtx_format_info* vtx_info_table =
ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
unsigned num_sgprs = 0;
for (unsigned loc = 0; loc < pinfo->num_attributes;) {
unsigned num_descs =
load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
@ -12875,11 +12900,6 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
}
}
bool needs_instance_index =
pinfo->instance_rate_inputs &
~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
if (needs_vertex_index)
bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);