From 821a812c7d9758f5dcaa46d9997adddbbac95f25 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 8 May 2026 16:52:40 -0700 Subject: [PATCH] brw: Don't directly use regs_read/regs_written/size_read as bound for non-trivial loops Instead save to a local variable and use that. In various cases the compiler is not able to pull it out of the loop, since there are other not inlined function calls as part of the loop's body, resulting in repeated unnecessary calls to either size_read() or its pieces that get inlined. Below are fossil compilation times in a MTL machine compiling shaders for a BMG GPU: ``` // Differences at 95.0% confidence. // Rise of the Tomb Raider (n=20) -0.017 +/- 0.00724575 -3.45177665% +/- 1.45084% // Alan Wake (n=20) -0.153 +/- 0.00960067 -4.99265786% +/- 0.303695% // Borderlands 3 (n=14) -0.486428571 +/- 0.15354 -3.51248195% +/- 1.0835% // Oblivion Remastered (n=14) -0.143571429 +/- 0.0357991 -3.05749924% +/- 0.747872% // Baldur's Gate 3 (n=14) -1.68928571 +/- 0.151598 -4.12128605% +/- 0.364259% ``` Reviewed-by: Ian Romanick Part-of: --- .../compiler/brw/brw_analysis_liveness.cpp | 6 ++-- .../compiler/brw/brw_analysis_performance.cpp | 12 ++++--- .../compiler/brw/brw_lower_scoreboard.cpp | 12 ++++--- .../brw/brw_schedule_instructions.cpp | 36 ++++++++++++------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/intel/compiler/brw/brw_analysis_liveness.cpp b/src/intel/compiler/brw/brw_analysis_liveness.cpp index 99308d5c686..5a99207debc 100644 --- a/src/intel/compiler/brw/brw_analysis_liveness.cpp +++ b/src/intel/compiler/brw/brw_analysis_liveness.cpp @@ -89,7 +89,8 @@ brw_live_variables::setup_def_use() continue; const int var = var_from_reg(inst->src[i]); - for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) + const unsigned read = regs_read(devinfo, inst, i); + for (unsigned j = 0; j < read; j++) setup_one_read(bd, ip, var + j); } @@ -98,7 +99,8 @@ brw_live_variables::setup_def_use() /* Set def[] for this instruction */ if (inst->dst.file == VGRF) { const int var = var_from_reg(inst->dst); - for (unsigned j = 0; j < regs_written(inst); j++) + const unsigned written = regs_written(inst); + for (unsigned j = 0; j < written; j++) setup_one_write(bd, inst, ip, var + j); } diff --git a/src/intel/compiler/brw/brw_analysis_performance.cpp b/src/intel/compiler/brw/brw_analysis_performance.cpp index bea11911275..02d6ce3a0e1 100644 --- a/src/intel/compiler/brw/brw_analysis_performance.cpp +++ b/src/intel/compiler/brw/brw_analysis_performance.cpp @@ -905,7 +905,8 @@ namespace { /* Stall on any source dependencies. */ for (unsigned i = 0; i < inst->sources; i++) { - for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) + const unsigned read = regs_read(devinfo, inst, i); + for (unsigned j = 0; j < read; j++) stall_on_dependency( st, reg_dependency_id(devinfo, inst->src[i], j)); } @@ -927,7 +928,8 @@ namespace { /* Stall on any write dependencies. */ if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) + const unsigned written = regs_written(inst); + for (unsigned j = 0; j < written; j++) stall_on_dependency( st, reg_dependency_id(devinfo, inst->dst, j)); } @@ -960,7 +962,8 @@ namespace { if (inst->is_send()) { for (unsigned i = 0; i < inst->sources; i++) { if (inst->is_payload(i)) { - for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) + const unsigned read = regs_read(devinfo, inst, i); + for (unsigned j = 0; j < read; j++) mark_read_dependency( st, perf, reg_dependency_id(devinfo, inst->src[i], j)); } @@ -969,7 +972,8 @@ namespace { /* Mark any destination dependencies. */ if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) { + const unsigned written = regs_written(inst); + for (unsigned j = 0; j < written; j++) { mark_write_dependency(st, perf, reg_dependency_id(devinfo, inst->dst, j)); } diff --git a/src/intel/compiler/brw/brw_lower_scoreboard.cpp b/src/intel/compiler/brw/brw_lower_scoreboard.cpp index 903147fc0cd..43139b0dca5 100644 --- a/src/intel/compiler/brw/brw_lower_scoreboard.cpp +++ b/src/intel/compiler/brw/brw_lower_scoreboard.cpp @@ -1020,7 +1020,8 @@ namespace { is_ordered ? dependency(TGL_REGDIST_SRC, jp, exec_all) : dependency::done; - for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) { + const unsigned read = regs_read(devinfo, inst, i); + for (unsigned j = 0; j < read; j++) { const brw_reg r = byte_offset(inst->src[i], REG_SIZE * j); sb.set(r, shadow(sb.get(r), rd_dep)); } @@ -1072,7 +1073,8 @@ namespace { if (is_valid(wr_dep) && inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) + const unsigned written = regs_written(inst); + for (unsigned j = 0; j < written; j++) sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep); } } @@ -1168,7 +1170,8 @@ namespace { std::vector inst_deps; for (unsigned i = 0; i < inst->sources; i++) { - for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++) + const unsigned read = regs_read(devinfo, inst, i); + for (unsigned j = 0; j < read; j++) add_dependency(ids, inst_deps, dependency_for_read( sb.get(byte_offset(inst->src[i], REG_SIZE * j)))); } @@ -1214,7 +1217,8 @@ namespace { if (inst->dst.file != BAD_FILE && !inst->dst.is_null() && !inst->dst.is_accumulator() && inst->opcode != SHADER_OPCODE_UNDEF) { - for (unsigned j = 0; j < regs_written(inst); j++) { + const unsigned written = regs_written(inst); + for (unsigned j = 0; j < written; j++) { add_dependency(ids, inst_deps, dependency_for_write(devinfo, inst, sb.get(byte_offset(inst->dst, REG_SIZE * j)))); } diff --git a/src/intel/compiler/brw/brw_schedule_instructions.cpp b/src/intel/compiler/brw/brw_schedule_instructions.cpp index 3c2b7f68fc9..cd7299784aa 100644 --- a/src/intel/compiler/brw/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw/brw_schedule_instructions.cpp @@ -1363,7 +1363,8 @@ brw_instruction_scheduler::calculate_deps() if (!inst->src[i].is_address()) continue; - for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) { + const unsigned read = inst->size_read(s->devinfo, i); + for (unsigned byte = 0; byte < read; byte += 2) { assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write)); schedule_node *write_addr_node = last_address_write[inst->src[i].address_slot(byte)]; @@ -1396,11 +1397,13 @@ brw_instruction_scheduler::calculate_deps() /* read-after-write deps. */ for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++) + const unsigned read = regs_read(s->devinfo, inst, i); + for (unsigned r = 0; r < read; r++) add_dep(last_grf_write[grf_index(inst->src[i]) + r], n); } else if (inst->src[i].file == FIXED_GRF) { if (post_reg_alloc) { - for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++) + const unsigned read = regs_read(s->devinfo, inst, i); + for (unsigned r = 0; r < read; r++) add_dep(last_grf_write[inst->src[i].nr + r], n); } else { add_dep(last_fixed_grf_write, n); @@ -1409,7 +1412,8 @@ brw_instruction_scheduler::calculate_deps() add_dep(last_accumulator_write, n); } else if (inst->src[i].is_address()) { if (post_reg_alloc) { - for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) + const unsigned read = inst->size_read(s->devinfo, i); + for (unsigned byte = 0; byte < read; byte += 2) add_dep(last_address_write[inst->src[i].address_slot(byte)], n); } } else if (register_needs_barrier(inst->src[i])) { @@ -1433,13 +1437,15 @@ brw_instruction_scheduler::calculate_deps() /* write-after-write deps. */ if (inst->dst.file == VGRF) { int grf_idx = grf_index(inst->dst); - for (unsigned r = 0; r < regs_written(inst); r++) { + const unsigned written = regs_written(inst); + for (unsigned r = 0; r < written; r++) { add_dep(last_grf_write[grf_idx + r], n); last_grf_write[grf_idx + r] = n; } } else if (inst->dst.file == FIXED_GRF) { if (post_reg_alloc) { - for (unsigned r = 0; r < regs_written(inst); r++) { + const unsigned written = regs_written(inst); + for (unsigned r = 0; r < written; r++) { add_dep(last_grf_write[inst->dst.nr + r], n); last_grf_write[inst->dst.nr + r] = n; } @@ -1500,11 +1506,13 @@ brw_instruction_scheduler::calculate_deps() /* write-after-read deps. */ for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++) + const unsigned read = regs_read(s->devinfo, inst, i); + for (unsigned r = 0; r < read; r++) add_dep(n, last_grf_write[grf_index(inst->src[i]) + r], 0); } else if (inst->src[i].file == FIXED_GRF) { if (post_reg_alloc) { - for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++) + const unsigned read = regs_read(s->devinfo, inst, i); + for (unsigned r = 0; r < read; r++) add_dep(n, last_grf_write[inst->src[i].nr + r], 0); } else { add_dep(n, last_fixed_grf_write, 0); @@ -1513,7 +1521,8 @@ brw_instruction_scheduler::calculate_deps() add_dep(n, last_accumulator_write, 0); } else if (inst->src[i].is_address()) { if (post_reg_alloc) { - for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) { + const unsigned read = inst->size_read(s->devinfo, i); + for (unsigned byte = 0; byte < read; byte += 2) { add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0); } } @@ -1544,7 +1553,8 @@ brw_instruction_scheduler::calculate_deps() * can mark this as WAR dependency. */ if (inst->dst.file == VGRF) { - for (unsigned r = 0; r < regs_written(inst); r++) + const unsigned written = regs_written(inst); + for (unsigned r = 0; r < written; r++) last_grf_write[grf_index(inst->dst) + r] = n; } else if (inst->dst.file == FIXED_GRF) { if (post_reg_alloc) { @@ -1603,7 +1613,8 @@ brw_instruction_scheduler::address_register_interfere(const schedule_node *n) for (unsigned i = 0; i < n->inst->sources; i++) { if (!n->inst->src[i].is_address()) continue; - for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) { + const unsigned read = n->inst->size_read(s->devinfo, i); + for (unsigned byte = 0; byte < read; byte += 2) { if (current.address_register[n->inst->src[i].address_slot(byte)] != n->inst->src[i].nr) return true; @@ -1772,7 +1783,8 @@ brw_instruction_scheduler::update_children(schedule_node *chosen) for (unsigned i = 0; i < chosen->inst->sources; i++) { if (!chosen->inst->src[i].is_address()) continue; - for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) { + const unsigned read = chosen->inst->size_read(s->devinfo, i); + for (unsigned byte = 0; byte < read; byte += 2) { assert(chosen->inst->src[i].address_slot(byte) < ARRAY_SIZE(current.address_register)); current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;