mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 06:48:06 +02:00
aco/ra: special-case get_reg_for_create_vector_copy()
This function implements separate handling for
p_create_vector during get_regs_for_copies().
This simplifies some code and allows swap instructions to be
selected more precisely where possible.
Totals from 876 (0.65% of 134913) affected shaders: (GFX10.3)
VGPRs: 53312 -> 53336 (+0.05%)
CodeSize: 3792936 -> 3788160 (-0.13%); split: -0.15%, +0.03%
MaxWaves: 16084 -> 16078 (-0.04%)
Instrs: 707449 -> 706385 (-0.15%); split: -0.19%, +0.04%
Latency: 6288293 -> 6286677 (-0.03%); split: -0.03%, +0.01%
InvThroughput: 4264450 -> 4263671 (-0.02%); split: -0.02%, +0.00%
VClause: 18655 -> 18679 (+0.13%); split: -0.20%, +0.33%
Copies: 55397 -> 54353 (-1.88%); split: -2.45%, +0.57%
Branches: 12426 -> 12415 (-0.09%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11526>
This commit is contained in:
parent
9181e8ceba
commit
9476986e6f
1 changed files with 77 additions and 60 deletions
|
|
@ -1019,6 +1019,55 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte
|
||||||
return vars;
|
return vars;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Chooses a register for variable `id` while copies are being placed for a
 * p_create_vector whose definition occupies `def_reg`.
 *
 * Returns {register, true} when a usable register was found. A false second
 * member means the caller has to find a register some other way.
 *
 * Two cases are handled:
 *  - `id` is an operand of `instr` that is killed before the definition: the
 *    result is the operand's own slot inside `def_reg`.
 *  - `id` currently sits inside `def_reg`, exactly at the start of an operand
 *    slot, and that operand is a first-killed temp of the same register class
 *    living outside `def_reg`: the two trade places. A parallelcopy moving the
 *    operand into the slot is appended to `parallelcopies`, and the operand's
 *    old register is returned for `id`.
 */
std::pair<PhysReg, bool>
|
||||||
|
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
|
||||||
|
std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
||||||
|
aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
|
||||||
|
DefInfo info, unsigned id)
|
||||||
|
{
|
||||||
|
/* start at the first byte of the definition and walk the operands in order */
PhysReg reg = def_reg.lo();
|
||||||
|
/* dead operand: return position in vector */
|
||||||
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||||
|
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
|
||||||
|
instr->operands[i].isKillBeforeDef()) {
|
||||||
|
/* the operand's slot inside the definition is expected to be free in reg_file */
assert(!reg_file.test(reg, info.rc.bytes()));
|
||||||
|
/* sub-dword classes additionally need the slot's byte offset to respect info.stride */
return {reg, !info.rc.is_subdword() || (reg.byte() % info.stride == 0)};
|
||||||
|
}
|
||||||
|
/* advance to the byte offset of the next operand's slot */
reg.reg_b += instr->operands[i].bytes();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* the swap path below is not attempted on GFX8 and older.
 * NOTE(review): presumably because a cheap swap instruction only exists on GFX9+ - confirm */
if (ctx.program->chip_class <= GFX8)
|
||||||
|
return {PhysReg(), false};
|
||||||
|
|
||||||
|
/* check if the previous position was in vector */
|
||||||
|
assignment& var = ctx.assignments[id];
|
||||||
|
if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
|
||||||
|
reg = def_reg.lo();
|
||||||
|
/* try to use the previous register of the operand */
|
||||||
|
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||||
|
/* skip operands until reaching the slot that starts at the variable's current register */
if (reg != var.reg) {
|
||||||
|
reg.reg_b += instr->operands[i].bytes();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check if we can swap positions */
|
||||||
|
if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
|
||||||
|
instr->operands[i].regClass() == info.rc) {
|
||||||
|
assignment& op = ctx.assignments[instr->operands[i].tempId()];
|
||||||
|
/* if everything matches, create parallelcopy for the killed operand */
|
||||||
|
if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) &&
|
||||||
|
reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
|
||||||
|
Definition pc_def = Definition(reg, info.rc);
|
||||||
|
parallelcopies.emplace_back(instr->operands[i], pc_def);
|
||||||
|
/* the variable takes over the register the killed operand occupied so far */
return {op.reg, true};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* the operand owning the slot at var.reg cannot be swapped: report failure */
return {PhysReg(), false};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {PhysReg(), false};
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
||||||
std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
||||||
|
|
@ -1029,6 +1078,7 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
||||||
/* variables are sorted from small sized to large */
|
/* variables are sorted from small sized to large */
|
||||||
/* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
|
/* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
|
||||||
* slightly though. */
|
* slightly though. */
|
||||||
|
// TODO: sort by register instead of id
|
||||||
for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
|
for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
|
||||||
it != vars.rend(); ++it) {
|
it != vars.rend(); ++it) {
|
||||||
unsigned id = it->second;
|
unsigned id = it->second;
|
||||||
|
|
@ -1039,34 +1089,24 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
||||||
/* check if this is a dead operand, then we can re-use the space from the definition
|
/* check if this is a dead operand, then we can re-use the space from the definition
|
||||||
* also use the correct stride for sub-dword operands */
|
* also use the correct stride for sub-dword operands */
|
||||||
bool is_dead_operand = false;
|
bool is_dead_operand = false;
|
||||||
for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
|
std::pair<PhysReg, bool> res{PhysReg(), false};
|
||||||
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
|
if (instr->opcode == aco_opcode::p_create_vector) {
|
||||||
if (instr->operands[i].isKillBeforeDef())
|
res =
|
||||||
is_dead_operand = true;
|
get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
|
||||||
info = DefInfo(ctx, instr, var.rc, i);
|
} else {
|
||||||
break;
|
for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
|
||||||
|
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
|
||||||
|
info = DefInfo(ctx, instr, var.rc, i);
|
||||||
|
if (instr->operands[i].isKillBeforeDef()) {
|
||||||
|
info.bounds = def_reg;
|
||||||
|
res = get_reg_simple(ctx, reg_file, info);
|
||||||
|
is_dead_operand = true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!res.second) {
|
||||||
std::pair<PhysReg, bool> res;
|
|
||||||
if (is_dead_operand) {
|
|
||||||
if (instr->opcode == aco_opcode::p_create_vector) {
|
|
||||||
PhysReg reg(def_reg.lo());
|
|
||||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
||||||
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
|
|
||||||
res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
|
|
||||||
!reg_file.test(reg, var.rc.bytes())};
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
reg.reg_b += instr->operands[i].bytes();
|
|
||||||
}
|
|
||||||
if (!res.second)
|
|
||||||
res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
|
|
||||||
} else {
|
|
||||||
info.bounds = def_reg;
|
|
||||||
res = get_reg_simple(ctx, reg_file, info);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* Try to find space within the bounds but outside of the definition */
|
/* Try to find space within the bounds but outside of the definition */
|
||||||
info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
|
info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
|
||||||
res = get_reg_simple(ctx, reg_file, info);
|
res = get_reg_simple(ctx, reg_file, info);
|
||||||
|
|
@ -1300,27 +1340,19 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
|
||||||
|
|
||||||
/* now, we figured the placement for our definition */
|
/* now, we figured the placement for our definition */
|
||||||
RegisterFile tmp_file(reg_file);
|
RegisterFile tmp_file(reg_file);
|
||||||
|
|
||||||
|
/* p_create_vector: also re-place killed operands in the definition space */
|
||||||
|
if (instr->opcode == aco_opcode::p_create_vector) {
|
||||||
|
for (Operand& op : instr->operands) {
|
||||||
|
if (op.isTemp() && op.isFirstKillBeforeDef())
|
||||||
|
tmp_file.fill(op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
|
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
|
||||||
|
|
||||||
if (instr->opcode == aco_opcode::p_create_vector) {
|
/* re-enable killed operands */
|
||||||
/* move killed operands which aren't yet at the correct position (GFX9+)
|
if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
|
||||||
* or which are in the definition space */
|
|
||||||
PhysReg reg = best_win.lo();
|
|
||||||
for (Operand& op : instr->operands) {
|
|
||||||
if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
|
|
||||||
if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
|
|
||||||
(op.physReg().advance(op.bytes()) > best_win.lo() &&
|
|
||||||
op.physReg() < best_win.hi()))) {
|
|
||||||
vars.emplace(op.bytes(), op.tempId());
|
|
||||||
tmp_file.clear(op);
|
|
||||||
} else {
|
|
||||||
tmp_file.fill(op);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reg.reg_b += op.bytes();
|
|
||||||
}
|
|
||||||
} else if (!is_phi(instr)) {
|
|
||||||
/* re-enable killed operands */
|
|
||||||
for (Operand& op : instr->operands) {
|
for (Operand& op : instr->operands) {
|
||||||
if (op.isTemp() && op.isFirstKillBeforeDef())
|
if (op.isTemp() && op.isFirstKillBeforeDef())
|
||||||
tmp_file.fill(op);
|
tmp_file.fill(op);
|
||||||
|
|
@ -1789,21 +1821,6 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
|
||||||
std::set<std::pair<unsigned, unsigned>> vars =
|
std::set<std::pair<unsigned, unsigned>> vars =
|
||||||
collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
|
collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
|
||||||
|
|
||||||
for (unsigned i = 0, offset = 0; i < instr->operands.size();
|
|
||||||
offset += instr->operands[i].bytes(), i++) {
|
|
||||||
if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
|
|
||||||
instr->operands[i].getTemp().type() != rc.type())
|
|
||||||
continue;
|
|
||||||
bool correct_pos = !tmp_file.test(instr->operands[i].physReg(), instr->operands[i].bytes());
|
|
||||||
/* GFX9+: move killed operands which aren't yet at the correct position
|
|
||||||
* Moving all killed operands generally leads to more register swaps.
|
|
||||||
* This is only done on GFX9+ because of the cheap v_swap instruction.
|
|
||||||
*/
|
|
||||||
if (ctx.program->chip_class >= GFX9 && !correct_pos) {
|
|
||||||
vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
|
|
||||||
tmp_file.clear(instr->operands[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
bool success = false;
|
bool success = false;
|
||||||
std::vector<std::pair<Operand, Definition>> pc;
|
std::vector<std::pair<Operand, Definition>> pc;
|
||||||
success =
|
success =
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue