brw: Don't directly use regs_read/regs_written/size_read as bound for non-trivial loops

Instead save to a local variable and use that.  In various cases the
compiler is not able to pull it out of the loop, since there are other
not inlined function calls as part of the loop's body, resulting in
repeated unnecessary calls to either size_read() or its pieces that
get inlined.

Below are fossil compilation times in a MTL machine compiling shaders
for a BMG GPU:

```
// Differences at 95.0% confidence.

// Rise of the Tomb Raider (n=20)
   -0.017 +/- 0.00724575
   -3.45177665% +/- 1.45084%

// Alan Wake (n=20)
   -0.153 +/- 0.00960067
   -4.99265786% +/- 0.303695%

// Borderlands 3 (n=14)
   -0.486428571 +/- 0.15354
   -3.51248195% +/- 1.0835%

// Oblivion Remastered (n=14)
   -0.143571429 +/- 0.0357991
   -3.05749924% +/- 0.747872%

// Baldur's Gate 3 (n=14)
   -1.68928571 +/- 0.151598
   -4.12128605% +/- 0.364259%
```

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41496>
This commit is contained in:
Caio Oliveira 2026-05-08 16:52:40 -07:00 committed by Marge Bot
parent 3f71aab327
commit 821a812c7d
4 changed files with 44 additions and 22 deletions

View file

@ -89,7 +89,8 @@ brw_live_variables::setup_def_use()
continue;
const int var = var_from_reg(inst->src[i]);
for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
const unsigned read = regs_read(devinfo, inst, i);
for (unsigned j = 0; j < read; j++)
setup_one_read(bd, ip, var + j);
}
@ -98,7 +99,8 @@ brw_live_variables::setup_def_use()
/* Set def[] for this instruction */
if (inst->dst.file == VGRF) {
const int var = var_from_reg(inst->dst);
for (unsigned j = 0; j < regs_written(inst); j++)
const unsigned written = regs_written(inst);
for (unsigned j = 0; j < written; j++)
setup_one_write(bd, inst, ip, var + j);
}

View file

@ -905,7 +905,8 @@ namespace {
/* Stall on any source dependencies. */
for (unsigned i = 0; i < inst->sources; i++) {
for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
const unsigned read = regs_read(devinfo, inst, i);
for (unsigned j = 0; j < read; j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->src[i], j));
}
@ -927,7 +928,8 @@ namespace {
/* Stall on any write dependencies. */
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
const unsigned written = regs_written(inst);
for (unsigned j = 0; j < written; j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->dst, j));
}
@ -960,7 +962,8 @@ namespace {
if (inst->is_send()) {
for (unsigned i = 0; i < inst->sources; i++) {
if (inst->is_payload(i)) {
for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
const unsigned read = regs_read(devinfo, inst, i);
for (unsigned j = 0; j < read; j++)
mark_read_dependency(
st, perf, reg_dependency_id(devinfo, inst->src[i], j));
}
@ -969,7 +972,8 @@ namespace {
/* Mark any destination dependencies. */
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++) {
const unsigned written = regs_written(inst);
for (unsigned j = 0; j < written; j++) {
mark_write_dependency(st, perf,
reg_dependency_id(devinfo, inst->dst, j));
}

View file

@ -1020,7 +1020,8 @@ namespace {
is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) :
dependency::done;
for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) {
const unsigned read = regs_read(devinfo, inst, i);
for (unsigned j = 0; j < read; j++) {
const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j);
sb.set(r, shadow(sb.get(r), rd_dep));
}
@ -1072,7 +1073,8 @@ namespace {
if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
!inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
const unsigned written = regs_written(inst);
for (unsigned j = 0; j < written; j++)
sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
}
}
@ -1168,7 +1170,8 @@ namespace {
std::vector<dependency> inst_deps;
for (unsigned i = 0; i < inst->sources; i++) {
for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
const unsigned read = regs_read(devinfo, inst, i);
for (unsigned j = 0; j < read; j++)
add_dependency(ids, inst_deps, dependency_for_read(
sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
}
@ -1214,7 +1217,8 @@ namespace {
if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
!inst->dst.is_accumulator() &&
inst->opcode != SHADER_OPCODE_UNDEF) {
for (unsigned j = 0; j < regs_written(inst); j++) {
const unsigned written = regs_written(inst);
for (unsigned j = 0; j < written; j++) {
add_dependency(ids, inst_deps, dependency_for_write(devinfo, inst,
sb.get(byte_offset(inst->dst, REG_SIZE * j))));
}

View file

@ -1363,7 +1363,8 @@ brw_instruction_scheduler::calculate_deps()
if (!inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
const unsigned read = inst->size_read(s->devinfo, i);
for (unsigned byte = 0; byte < read; byte += 2) {
assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write));
schedule_node *write_addr_node =
last_address_write[inst->src[i].address_slot(byte)];
@ -1396,11 +1397,13 @@ brw_instruction_scheduler::calculate_deps()
/* read-after-write deps. */
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
const unsigned read = regs_read(s->devinfo, inst, i);
for (unsigned r = 0; r < read; r++)
add_dep(last_grf_write[grf_index(inst->src[i]) + r], n);
} else if (inst->src[i].file == FIXED_GRF) {
if (post_reg_alloc) {
for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
const unsigned read = regs_read(s->devinfo, inst, i);
for (unsigned r = 0; r < read; r++)
add_dep(last_grf_write[inst->src[i].nr + r], n);
} else {
add_dep(last_fixed_grf_write, n);
@ -1409,7 +1412,8 @@ brw_instruction_scheduler::calculate_deps()
add_dep(last_accumulator_write, n);
} else if (inst->src[i].is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2)
const unsigned read = inst->size_read(s->devinfo, i);
for (unsigned byte = 0; byte < read; byte += 2)
add_dep(last_address_write[inst->src[i].address_slot(byte)], n);
}
} else if (register_needs_barrier(inst->src[i])) {
@ -1433,13 +1437,15 @@ brw_instruction_scheduler::calculate_deps()
/* write-after-write deps. */
if (inst->dst.file == VGRF) {
int grf_idx = grf_index(inst->dst);
for (unsigned r = 0; r < regs_written(inst); r++) {
const unsigned written = regs_written(inst);
for (unsigned r = 0; r < written; r++) {
add_dep(last_grf_write[grf_idx + r], n);
last_grf_write[grf_idx + r] = n;
}
} else if (inst->dst.file == FIXED_GRF) {
if (post_reg_alloc) {
for (unsigned r = 0; r < regs_written(inst); r++) {
const unsigned written = regs_written(inst);
for (unsigned r = 0; r < written; r++) {
add_dep(last_grf_write[inst->dst.nr + r], n);
last_grf_write[inst->dst.nr + r] = n;
}
@ -1500,11 +1506,13 @@ brw_instruction_scheduler::calculate_deps()
/* write-after-read deps. */
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
const unsigned read = regs_read(s->devinfo, inst, i);
for (unsigned r = 0; r < read; r++)
add_dep(n, last_grf_write[grf_index(inst->src[i]) + r], 0);
} else if (inst->src[i].file == FIXED_GRF) {
if (post_reg_alloc) {
for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
const unsigned read = regs_read(s->devinfo, inst, i);
for (unsigned r = 0; r < read; r++)
add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
} else {
add_dep(n, last_fixed_grf_write, 0);
@ -1513,7 +1521,8 @@ brw_instruction_scheduler::calculate_deps()
add_dep(n, last_accumulator_write, 0);
} else if (inst->src[i].is_address()) {
if (post_reg_alloc) {
for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
const unsigned read = inst->size_read(s->devinfo, i);
for (unsigned byte = 0; byte < read; byte += 2) {
add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0);
}
}
@ -1544,7 +1553,8 @@ brw_instruction_scheduler::calculate_deps()
* can mark this as WAR dependency.
*/
if (inst->dst.file == VGRF) {
for (unsigned r = 0; r < regs_written(inst); r++)
const unsigned written = regs_written(inst);
for (unsigned r = 0; r < written; r++)
last_grf_write[grf_index(inst->dst) + r] = n;
} else if (inst->dst.file == FIXED_GRF) {
if (post_reg_alloc) {
@ -1603,7 +1613,8 @@ brw_instruction_scheduler::address_register_interfere(const schedule_node *n)
for (unsigned i = 0; i < n->inst->sources; i++) {
if (!n->inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) {
const unsigned read = n->inst->size_read(s->devinfo, i);
for (unsigned byte = 0; byte < read; byte += 2) {
if (current.address_register[n->inst->src[i].address_slot(byte)] !=
n->inst->src[i].nr)
return true;
@ -1772,7 +1783,8 @@ brw_instruction_scheduler::update_children(schedule_node *chosen)
for (unsigned i = 0; i < chosen->inst->sources; i++) {
if (!chosen->inst->src[i].is_address())
continue;
for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) {
const unsigned read = chosen->inst->size_read(s->devinfo, i);
for (unsigned byte = 0; byte < read; byte += 2) {
assert(chosen->inst->src[i].address_slot(byte) <
ARRAY_SIZE(current.address_register));
current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;