aco: improve p_create_vector RA for sub-dword operands

There are still improvements needed for sub-dword definitions, but that's
not as simple.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4507>
This commit is contained in:
Rhys Perry 2020-04-10 11:38:20 +01:00 committed by Marge Bot
parent e18711cda3
commit 52cc1f8237

View file

@ -481,11 +481,12 @@ bool get_regs_for_copies(ra_ctx& ctx,
std::pair<PhysReg, bool> res; std::pair<PhysReg, bool> res;
if (is_dead_operand) { if (is_dead_operand) {
if (instr->opcode == aco_opcode::p_create_vector) { if (instr->opcode == aco_opcode::p_create_vector) {
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
for (unsigned j = 0; j < size; j++) PhysReg reg(def_reg_lo);
assert(reg_file[def_reg_lo + offset + j] == 0); reg.reg_b += offset;
res = {PhysReg{def_reg_lo + offset}, true}; assert(!reg_file.test(reg, var.rc.bytes()));
res = {reg, true};
break; break;
} }
} }
@ -893,6 +894,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
{ {
/* create_vector instructions have different costs w.r.t. register coalescing */ /* create_vector instructions have different costs w.r.t. register coalescing */
uint32_t size = rc.size(); uint32_t size = rc.size();
uint32_t bytes = rc.bytes();
uint32_t stride = 1; uint32_t stride = 1;
uint32_t lb, ub; uint32_t lb, ub;
if (rc.type() == RegType::vgpr) { if (rc.type() == RegType::vgpr) {
@ -907,20 +909,25 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
stride = 4; stride = 4;
} }
//TODO: improve p_create_vector for sub-dword vectors
unsigned best_pos = -1; unsigned best_pos = -1;
unsigned num_moves = 0xFF; unsigned num_moves = 0xFF;
bool best_war_hint = true; bool best_war_hint = true;
/* test for each operand which definition placement causes the least shuffle instructions */ /* test for each operand which definition placement causes the least shuffle instructions */
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
// TODO: think about, if we can alias live operands on the same register // TODO: think about, if we can alias live operands on the same register
if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type()) if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || instr->operands[i].getTemp().type() != rc.type())
continue; continue;
if (offset > instr->operands[i].physReg()) if (offset > instr->operands[i].physReg().reg_b)
continue; continue;
unsigned reg_lo = instr->operands[i].physReg() - offset; unsigned reg_lo = instr->operands[i].physReg().reg_b - offset;
if (reg_lo % 4)
continue;
reg_lo /= 4;
unsigned reg_hi = reg_lo + size - 1; unsigned reg_hi = reg_lo + size - 1;
unsigned k = 0; unsigned k = 0;
@ -942,10 +949,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
bool linear_vgpr = false; bool linear_vgpr = false;
for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) { for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
if (reg_file[j] != 0) { if (reg_file[j] != 0) {
k++; if (reg_file[j] == 0xF0000000) {
/* we cannot split live ranges of linear vgprs */ PhysReg reg;
if (ctx.assignments[reg_file[j]].rc & (1 << 6)) reg.reg_b = j * 4;
linear_vgpr = true; unsigned bytes_left = bytes - (j - reg_lo) * 4;
for (unsigned k = 0; k < MIN2(bytes_left, 4); k++, reg.reg_b++)
k += reg_file.test(reg, 1);
} else {
k += 4;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].rc & (1 << 6))
linear_vgpr = true;
}
} }
war_hint |= ctx.war_hint[j]; war_hint |= ctx.war_hint[j];
} }
@ -953,13 +968,13 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
continue; continue;
/* count operands in wrong positions */ /* count operands in wrong positions */
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) { for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].bytes(), j++) {
if (j == i || if (j == i ||
!instr->operands[j].isTemp() || !instr->operands[j].isTemp() ||
instr->operands[j].getTemp().type() != rc.type()) instr->operands[j].getTemp().type() != rc.type())
continue; continue;
if (instr->operands[j].physReg() != reg_lo + offset) if (instr->operands[j].physReg().reg_b != reg_lo * 4 + offset)
k += instr->operands[j].size(); k += instr->operands[j].bytes();
} }
bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves)) if (k > num_moves || (!aligned && k == num_moves))
@ -970,18 +985,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
best_war_hint = war_hint; best_war_hint = war_hint;
} }
if (num_moves >= size) if (num_moves >= bytes)
return get_reg(ctx, reg_file, rc, parallelcopies, instr); return get_reg(ctx, reg_file, rc, parallelcopies, instr);
/* collect variables to be moved */ /* collect variables to be moved */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size); std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
/* move killed operands which aren't yet at the correct position */ /* move killed operands which aren't yet at the correct position */
for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
if (instr->operands[i].isTemp() && if (instr->operands[i].isTemp() &&
instr->operands[i].isFirstKillBeforeDef() && instr->operands[i].isFirstKillBeforeDef() &&
instr->operands[i].getTemp().type() == rc.type() && instr->operands[i].getTemp().type() == rc.type() &&
instr->operands[i].physReg() != best_pos + offset) instr->operands[i].physReg().reg_b != best_pos * 4 + offset)
vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
} }