From ec892c4d2b6360efdae4e2dc9a23ce320b6aa1d6 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 28 Feb 2024 12:23:17 +0000
Subject: [PATCH] aco: don't reuse misaligned attribute destination VGPRs in VS prologs

Since we split misaligned attributes, we could overwrite one of these
VGPRs in the middle of loading the attribute.

For example:
v_add_u32_e32 v4, vcc, s7, v1
s_waitcnt lgkmcnt(0)
buffer_load_dword v4, v4, s[32:35], 0 idxen
buffer_load_dword v5, v4, s[32:35], 0 idxen offset:4
can overwrite the vertex index in the load of the first component.

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Cc: mesa-stable
Part-of:
---
 .../compiler/aco_instruction_selection.cpp    | 58 +++++++++++++------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index ed4cbf1d907..56743461cc5 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -12767,6 +12767,20 @@ select_rt_prolog(Program* program, ac_shader_config* config,
    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
 }
 
+PhysReg
+get_next_vgpr(unsigned size, unsigned* num, int *offset = NULL)
+{
+   unsigned reg = *num + (offset ? *offset : 0);
+   if (reg + size >= *num) {
+      *num = reg + size;
+      if (offset)
+         *offset = 0;
+   } else if (offset) {
+      *offset += size;
+   }
+   return PhysReg(256 + reg);
+}
+
 void
 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
@@ -12808,13 +12822,30 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
    Operand start_instance = get_arg_fixed(args, args->start_instance);
    Operand instance_id = get_arg_fixed(args, args->instance_id);
 
-   PhysReg attributes_start(256 + args->num_vgprs_used);
-   /* choose vgprs that won't be used for anything else until the last attribute load */
-   PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
-   PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
-   PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
-   PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
-   PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
+   bool needs_instance_index =
+      pinfo->instance_rate_inputs &
+      ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
+   bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
+   bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
+   bool needs_tmp_vgpr0 = has_nontrivial_divisors;
+   bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
+                          (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
+
+   int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
+
+   unsigned num_vgprs = args->num_vgprs_used;
+   PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
+   PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
+   if (needs_vertex_index)
+      vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
+   if (needs_instance_index)
+      instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
+   if (needs_start_instance)
+      start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
+   if (needs_tmp_vgpr0)
+      nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
+   if (needs_tmp_vgpr1)
+      nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
 
    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
             get_arg_fixed(args, args->vertex_buffers));
@@ -12826,16 +12857,10 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
                Operand::c32((unsigned)options->address32_hi));
    }
 
-   /* calculate vgpr requirements */
-   unsigned num_vgprs = attributes_start.reg() - 256;
-   num_vgprs += pinfo->num_attributes * 4;
-   if (has_nontrivial_divisors && program->gfx_level <= GFX8)
-      num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
-   unsigned num_sgprs = 0;
-
    const struct ac_vtx_format_info* vtx_info_table =
       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
 
+   unsigned num_sgprs = 0;
    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
       unsigned num_descs =
          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
@@ -12875,11 +12900,6 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
       }
    }
 
-   bool needs_instance_index =
-      pinfo->instance_rate_inputs &
-      ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
-   bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
-   bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
    if (needs_vertex_index)
       bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
                  get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);