intel/brw/xe3+: Define BRW_SCHEDULE_PRE_LATENCY scheduling mode.

This defines a new pre-RA scheduling mode similar to BRW_SCHEDULE_PRE
but more aggressive at optimizing for minimum latency rather than
minimum register usage.  The main motivation is that on recent xe3
platforms we use a register allocation heuristic that packs variables
more tightly at the bottom of the register file instead of the
round-robin heuristic we used on previous platforms, since as a result
of VRT there is a parallelism penalty when a program uses more GRF
registers than necessary.  Unfortunately the xe3 tight-packing
heuristic severely constrains the work of the post-RA scheduler due to
the false dependencies introduced during register allocation, so we
can do a better job by making the scheduler aware of instruction
latencies before the register allocator introduces any false
dependencies.

This can lead to higher register pressure, but only when the scheduler
decides it could save cycles by extending a live range.  It makes
sense to preserve the preexisting BRW_SCHEDULE_PRE as a separate mode
since some workloads can still benefit from neglecting latencies
pre-RA due to the trade-off mentioned between parallelism and GRF use.
A future commit will introduce a more accurate estimate of the
expected relative performance of BRW_SCHEDULE_PRE
vs. BRW_SCHEDULE_PRE_LATENCY taking into account this trade-off.

In theory this could also be helpful on earlier pre-xe3 platforms, but
the benefit should be significantly smaller due to the different RA
heuristic so it hasn't been tested extensively pre-xe3.

The following Traci tests are improved significantly by this change on
PTL (nearly all tests that run on my system are affected positively):

Ghostrunner2-trace-dx11-1440p-ultra:                7.12% ±0.36%
SpaceEngineers-trace-dx11-2160p-high:               5.77% ±0.43%
HogwartsLegacy-trace-dx12-1080p-ultra:              4.40% ±0.03%
Naraka-trace-dx11-1440p-highest:                    3.06% ±0.43%
MetroExodus-trace-dx11-2160p-ultra:                 2.26% ±0.60%
Fortnite-trace-dx11-2160p-epix:                     2.12% ±0.53%
Nba2K23-trace-dx11-2160p-ultra:                     1.98% ±0.30%
Control-trace-dx11-1440p-high:                      1.93% ±0.36%
GodOfWar-trace-dx11-2160p-ultra:                    1.62% ±0.47%
TotalWarPharaoh-trace-dx11-1440p-ultra:             1.55% ±0.18%
MountAndBlade2-trace-dx11-1440p-veryhigh:           1.51% ±0.37%
Destiny2-trace-dx11-1440p-highest:                  1.44% ±0.34%
GtaV-trace-dx11-2160p-ultra:                        1.26% ±0.27%
ShadowTombRaider-trace-dx11-2160p-ultra:            1.10% ±0.58%
Borderlands3-trace-dx11-2160p-ultra:                0.95% ±0.43%
TerminatorResistance-trace-dx11-2160p-ultra:        0.87% ±0.22%
BaldursGate3-trace-dx11-1440p-ultra:                0.84% ±0.28%
CitiesSkylines2-trace-dx11-1440p-high:              0.82% ±0.22%
PubG-trace-dx11-1440p-ultra:                        0.72% ±0.37%
Palworld-trace-dx11-1080p-med:                      0.71% ±0.26%
Superposition-trace-dx11-2160p-extreme:             0.69% ±0.19%

The compile-time cost of shader-db increases significantly, by 1.85%,
after this commit (14 iterations, 5% significance), while the
compile-time of fossil-db doesn't change significantly in my setup.

v2: Addressed interaction with 81594d0db1,
    since the code that calculates deps, delays and exits is no longer
    mode-independent after this change.  Instead of reverting that
    commit (which is non-trivial and would have a greater compile-time
    hit) simply reconstruct the scheduler object during the transition
    between BRW_SCHEDULE_PRE_LATENCY and any other PRE mode that
    doesn't require instruction latencies.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36618>
This commit is contained in:
Francisco Jerez 2025-07-16 15:34:43 -07:00 committed by Marge Bot
parent 501b1cbc2c
commit 35ac517780
3 changed files with 40 additions and 9 deletions

View file

@ -587,7 +587,7 @@ schedule_node::set_latency(const struct brw_isa_info *isa)
class brw_instruction_scheduler {
public:
brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, int grf_count, int hw_reg_count,
int block_count, bool post_reg_alloc);
int block_count, bool post_reg_alloc, bool need_latencies);
void add_barrier_deps(schedule_node *n);
void add_cross_lane_deps(schedule_node *n);
@ -706,8 +706,9 @@ public:
};
brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_shader *s,
int grf_count, int hw_reg_count,
int block_count, bool post_reg_alloc)
int grf_count, int hw_reg_count,
int block_count, bool post_reg_alloc,
bool need_latencies)
: s(s)
{
this->mem_ctx = mem_ctx;
@ -724,7 +725,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh
foreach_block_and_inst(block, brw_inst, inst, s->cfg) {
n->inst = inst;
if (!post_reg_alloc)
if (!need_latencies)
n->latency = 1;
else
n->set_latency(isa);
@ -742,7 +743,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh
current.available.make_empty();
this->hw_reg_count = hw_reg_count;
this->mode = BRW_SCHEDULE_NONE;
this->mode = (need_latencies ? BRW_SCHEDULE_PRE_LATENCY : BRW_SCHEDULE_NONE);
this->reg_pressure = 0;
if (!post_reg_alloc) {
@ -1571,7 +1572,7 @@ brw_instruction_scheduler::choose_instruction_to_schedule()
{
schedule_node *chosen = NULL;
if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) {
if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_PRE_LATENCY || mode == BRW_SCHEDULE_POST) {
int chosen_time = 0;
/* Of the instructions ready to execute or the closest to being ready,
@ -1848,8 +1849,16 @@ brw_prepare_scheduler(brw_shader &s, void *mem_ctx)
const int grf_count = s.alloc.count;
brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler);
return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
s.cfg->num_blocks, /* post_reg_alloc */ false);
return new (empty) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, grf_count,
s.first_non_payload_grf, s.cfg->num_blocks,
/* post_reg_alloc */ false, s.devinfo->ver >= 30);
}
static bool
needs_instruction_latencies(brw_instruction_scheduler_mode mode)
{
return mode == BRW_SCHEDULE_PRE_LATENCY ||
mode == BRW_SCHEDULE_POST;
}
void
@ -1859,6 +1868,21 @@ brw_schedule_instructions_pre_ra(brw_shader &s, brw_instruction_scheduler *sched
if (mode == BRW_SCHEDULE_NONE)
return;
if (needs_instruction_latencies(mode) != needs_instruction_latencies(sched->mode)) {
/* The new mode requires different instruction latencies, which
* requires recalculating the dependency graph as well as the
* delay and exit metadata. Instead of maintaining a codepath
* to reset and recompute most of the scheduler data structure
* simply recreate the scheduler object.
*/
void *mem_ctx = ralloc_parent(sched);
ralloc_free(sched->mem_ctx);
new (sched) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, s.alloc.count,
s.first_non_payload_grf, s.cfg->num_blocks,
/* post_reg_alloc */ false,
needs_instruction_latencies(mode));
}
sched->run(mode);
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
@ -1868,12 +1892,13 @@ void
brw_schedule_instructions_post_ra(brw_shader &s)
{
const bool post_reg_alloc = true;
const bool need_latencies = true;
const int grf_count = reg_unit(s.devinfo) * s.grf_used;
void *mem_ctx = ralloc_context(NULL);
brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
s.cfg->num_blocks, post_reg_alloc);
s.cfg->num_blocks, post_reg_alloc, need_latencies);
sched.run(BRW_SCHEDULE_POST);
ralloc_free(mem_ctx);

View file

@ -1087,6 +1087,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
bool allocated;
static const enum brw_instruction_scheduler_mode pre_modes[] = {
BRW_SCHEDULE_PRE_LATENCY,
BRW_SCHEDULE_PRE,
BRW_SCHEDULE_PRE_NON_LIFO,
BRW_SCHEDULE_NONE,
@ -1094,6 +1095,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
};
static const char *scheduler_mode_name[] = {
[BRW_SCHEDULE_PRE_LATENCY] = "latency-sensitive",
[BRW_SCHEDULE_PRE] = "top-down",
[BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
[BRW_SCHEDULE_PRE_LIFO] = "lifo",
@ -1130,6 +1132,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];
if (devinfo->ver < 30 && sched_mode == BRW_SCHEDULE_PRE_LATENCY)
continue;
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

View file

@ -286,6 +286,7 @@ void brw_calculate_cfg(brw_shader &s);
void brw_optimize(brw_shader &s);
enum brw_instruction_scheduler_mode {
BRW_SCHEDULE_PRE_LATENCY,
BRW_SCHEDULE_PRE,
BRW_SCHEDULE_PRE_NON_LIFO,
BRW_SCHEDULE_PRE_LIFO,