mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-19 03:30:42 +01:00
intel/brw/xe3+: Define BRW_SCHEDULE_PRE_LATENCY scheduling mode.
This defines a new pre-RA scheduling mode similar to BRW_SCHEDULE_PRE
but more aggressive at optimizing for minimum latency rather than
minimum register usage. The main motivation is that on recent xe3
platforms we use a register allocation heuristic that packs variables
more tightly at the bottom of the register file instead of the
round-robin heuristic we used on previous platforms, since as a result
of VRT there is a parallelism penalty when a program uses more GRF
registers than necessary. Unfortunately the xe3 tight-packing
heuristic severely constrains the work of the post-RA scheduler due to
the false dependencies introduced during register allocation, so we
can do a better job by making the scheduler aware of instruction
latencies before the register allocator introduces any false
dependencies.
This can lead to higher register pressure, but only when the scheduler
decides it could save cycles by extending a live range. It makes
sense to preserve the preexisting BRW_SCHEDULE_PRE as a separate mode
since some workloads can still benefit from neglecting latencies
pre-RA due to the trade-off mentioned between parallelism and GRF use.
A future commit will introduce a more accurate estimate of the
expected relative performance of BRW_SCHEDULE_PRE
vs. BRW_SCHEDULE_PRE_LATENCY taking into account this trade-off.
In theory this could also be helpful on earlier pre-xe3 platforms, but
the benefit should be significantly smaller due to the different RA
heuristic so it hasn't been tested extensively pre-xe3.
The following Traci tests are improved significantly by this change on
PTL (nearly all tests that run on my system are affected positively):
Ghostrunner2-trace-dx11-1440p-ultra: 7.12% ±0.36%
SpaceEngineers-trace-dx11-2160p-high: 5.77% ±0.43%
HogwartsLegacy-trace-dx12-1080p-ultra: 4.40% ±0.03%
Naraka-trace-dx11-1440p-highest: 3.06% ±0.43%
MetroExodus-trace-dx11-2160p-ultra: 2.26% ±0.60%
Fortnite-trace-dx11-2160p-epix: 2.12% ±0.53%
Nba2K23-trace-dx11-2160p-ultra: 1.98% ±0.30%
Control-trace-dx11-1440p-high: 1.93% ±0.36%
GodOfWar-trace-dx11-2160p-ultra: 1.62% ±0.47%
TotalWarPharaoh-trace-dx11-1440p-ultra: 1.55% ±0.18%
MountAndBlade2-trace-dx11-1440p-veryhigh: 1.51% ±0.37%
Destiny2-trace-dx11-1440p-highest: 1.44% ±0.34%
GtaV-trace-dx11-2160p-ultra: 1.26% ±0.27%
ShadowTombRaider-trace-dx11-2160p-ultra: 1.10% ±0.58%
Borderlands3-trace-dx11-2160p-ultra: 0.95% ±0.43%
TerminatorResistance-trace-dx11-2160p-ultra: 0.87% ±0.22%
BaldursGate3-trace-dx11-1440p-ultra: 0.84% ±0.28%
CitiesSkylines2-trace-dx11-1440p-high: 0.82% ±0.22%
PubG-trace-dx11-1440p-ultra: 0.72% ±0.37%
Palworld-trace-dx11-1080p-med: 0.71% ±0.26%
Superposition-trace-dx11-2160p-extreme: 0.69% ±0.19%
The compile-time cost of shader-db increases significantly by 1.85%
after this commit (14 iterations, 5% significance), while the
compile-time of fossil-db doesn't change significantly in my setup.
v2: Addressed interaction with 81594d0db1,
since the code that calculates deps, delays and exits is no longer
mode-independent after this change. Instead of reverting that
commit (which is non-trivial and would have a greater compile-time
hit) simply reconstruct the scheduler object during the transition
between BRW_SCHEDULE_PRE_LATENCY and any other PRE mode that
doesn't require instruction latencies.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36618>
This commit is contained in:
parent
501b1cbc2c
commit
35ac517780
3 changed files with 40 additions and 9 deletions
|
|
@ -587,7 +587,7 @@ schedule_node::set_latency(const struct brw_isa_info *isa)
|
|||
class brw_instruction_scheduler {
|
||||
public:
|
||||
brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, int grf_count, int hw_reg_count,
|
||||
int block_count, bool post_reg_alloc);
|
||||
int block_count, bool post_reg_alloc, bool need_latencies);
|
||||
|
||||
void add_barrier_deps(schedule_node *n);
|
||||
void add_cross_lane_deps(schedule_node *n);
|
||||
|
|
@ -706,8 +706,9 @@ public:
|
|||
};
|
||||
|
||||
brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_shader *s,
|
||||
int grf_count, int hw_reg_count,
|
||||
int block_count, bool post_reg_alloc)
|
||||
int grf_count, int hw_reg_count,
|
||||
int block_count, bool post_reg_alloc,
|
||||
bool need_latencies)
|
||||
: s(s)
|
||||
{
|
||||
this->mem_ctx = mem_ctx;
|
||||
|
|
@ -724,7 +725,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh
|
|||
foreach_block_and_inst(block, brw_inst, inst, s->cfg) {
|
||||
n->inst = inst;
|
||||
|
||||
if (!post_reg_alloc)
|
||||
if (!need_latencies)
|
||||
n->latency = 1;
|
||||
else
|
||||
n->set_latency(isa);
|
||||
|
|
@ -742,7 +743,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh
|
|||
current.available.make_empty();
|
||||
|
||||
this->hw_reg_count = hw_reg_count;
|
||||
this->mode = BRW_SCHEDULE_NONE;
|
||||
this->mode = (need_latencies ? BRW_SCHEDULE_PRE_LATENCY : BRW_SCHEDULE_NONE);
|
||||
this->reg_pressure = 0;
|
||||
|
||||
if (!post_reg_alloc) {
|
||||
|
|
@ -1571,7 +1572,7 @@ brw_instruction_scheduler::choose_instruction_to_schedule()
|
|||
{
|
||||
schedule_node *chosen = NULL;
|
||||
|
||||
if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) {
|
||||
if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_PRE_LATENCY || mode == BRW_SCHEDULE_POST) {
|
||||
int chosen_time = 0;
|
||||
|
||||
/* Of the instructions ready to execute or the closest to being ready,
|
||||
|
|
@ -1848,8 +1849,16 @@ brw_prepare_scheduler(brw_shader &s, void *mem_ctx)
|
|||
const int grf_count = s.alloc.count;
|
||||
|
||||
brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler);
|
||||
return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
|
||||
s.cfg->num_blocks, /* post_reg_alloc */ false);
|
||||
return new (empty) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, grf_count,
|
||||
s.first_non_payload_grf, s.cfg->num_blocks,
|
||||
/* post_reg_alloc */ false, s.devinfo->ver >= 30);
|
||||
}
|
||||
|
||||
static bool
|
||||
needs_instruction_latencies(brw_instruction_scheduler_mode mode)
|
||||
{
|
||||
return mode == BRW_SCHEDULE_PRE_LATENCY ||
|
||||
mode == BRW_SCHEDULE_POST;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -1859,6 +1868,21 @@ brw_schedule_instructions_pre_ra(brw_shader &s, brw_instruction_scheduler *sched
|
|||
if (mode == BRW_SCHEDULE_NONE)
|
||||
return;
|
||||
|
||||
if (needs_instruction_latencies(mode) != needs_instruction_latencies(sched->mode)) {
|
||||
/* The new mode requires different instruction latencies, which
|
||||
* requires recalculating the dependency graph as well as the
|
||||
* delay and exit metadata. Instead of maintaining a codepath
|
||||
* to reset and recompute most of the scheduler data structure
|
||||
* simply recreate the scheduler object.
|
||||
*/
|
||||
void *mem_ctx = ralloc_parent(sched);
|
||||
ralloc_free(sched->mem_ctx);
|
||||
new (sched) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, s.alloc.count,
|
||||
s.first_non_payload_grf, s.cfg->num_blocks,
|
||||
/* post_reg_alloc */ false,
|
||||
needs_instruction_latencies(mode));
|
||||
}
|
||||
|
||||
sched->run(mode);
|
||||
|
||||
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
||||
|
|
@ -1868,12 +1892,13 @@ void
|
|||
brw_schedule_instructions_post_ra(brw_shader &s)
|
||||
{
|
||||
const bool post_reg_alloc = true;
|
||||
const bool need_latencies = true;
|
||||
const int grf_count = reg_unit(s.devinfo) * s.grf_used;
|
||||
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
|
||||
s.cfg->num_blocks, post_reg_alloc);
|
||||
s.cfg->num_blocks, post_reg_alloc, need_latencies);
|
||||
sched.run(BRW_SCHEDULE_POST);
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
|
|
|
|||
|
|
@ -1087,6 +1087,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
|||
bool allocated;
|
||||
|
||||
static const enum brw_instruction_scheduler_mode pre_modes[] = {
|
||||
BRW_SCHEDULE_PRE_LATENCY,
|
||||
BRW_SCHEDULE_PRE,
|
||||
BRW_SCHEDULE_PRE_NON_LIFO,
|
||||
BRW_SCHEDULE_NONE,
|
||||
|
|
@ -1094,6 +1095,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
|||
};
|
||||
|
||||
static const char *scheduler_mode_name[] = {
|
||||
[BRW_SCHEDULE_PRE_LATENCY] = "latency-sensitive",
|
||||
[BRW_SCHEDULE_PRE] = "top-down",
|
||||
[BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
|
||||
[BRW_SCHEDULE_PRE_LIFO] = "lifo",
|
||||
|
|
@ -1130,6 +1132,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
|||
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
||||
enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];
|
||||
|
||||
if (devinfo->ver < 30 && sched_mode == BRW_SCHEDULE_PRE_LATENCY)
|
||||
continue;
|
||||
|
||||
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
|
||||
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
||||
|
||||
|
|
|
|||
|
|
@ -286,6 +286,7 @@ void brw_calculate_cfg(brw_shader &s);
|
|||
void brw_optimize(brw_shader &s);
|
||||
|
||||
enum brw_instruction_scheduler_mode {
|
||||
BRW_SCHEDULE_PRE_LATENCY,
|
||||
BRW_SCHEDULE_PRE,
|
||||
BRW_SCHEDULE_PRE_NON_LIFO,
|
||||
BRW_SCHEDULE_PRE_LIFO,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue