From 35ac517780caff8546bbd4fcff6f6398e231148a Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Wed, 16 Jul 2025 15:34:43 -0700 Subject: [PATCH] intel/brw/xe3+: Define BRW_SCHEDULE_PRE_LATENCY scheduling mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This defines a new pre-RA scheduling mode similar to BRW_SCHEDULE_PRE but more aggressive at optimizing for minimum latency rather than minimum register usage. The main motivation is that on recent xe3 platforms we use a register allocation heuristic that packs variables more tightly at the bottom of the register file instead of the round-robin heuristic we used on previous platforms, since as a result of VRT there is a parallelism penalty when a program uses more GRF registers than necessary. Unfortunately the xe3 tight-packing heuristic severely constrains the work of the post-RA scheduler due to the false dependencies introduced during register allocation, so we can do a better job by making the scheduler aware of instruction latencies before the register allocator introduces any false dependencies. This can lead to higher register pressure, but only when the scheduler decides it could save cycles by extending a live range. It makes sense to preserve the preexisting BRW_SCHEDULE_PRE as a separate mode since some workloads can still benefit from neglecting latencies pre-RA due to the trade-off mentioned between parallelism and GRF use; a future commit will introduce a more accurate estimate of the expected relative performance of BRW_SCHEDULE_PRE vs. BRW_SCHEDULE_PRE_LATENCY taking into account this trade-off. In theory this could also be helpful on earlier pre-xe3 platforms, but the benefit should be significantly smaller due to the different RA heuristic, so it hasn't been tested extensively pre-xe3. 
The following Traci tests are improved significantly by this change on PTL (nearly all tests that run on my system are affected positively): Ghostrunner2-trace-dx11-1440p-ultra: 7.12% ±0.36% SpaceEngineers-trace-dx11-2160p-high: 5.77% ±0.43% HogwartsLegacy-trace-dx12-1080p-ultra: 4.40% ±0.03% Naraka-trace-dx11-1440p-highest: 3.06% ±0.43% MetroExodus-trace-dx11-2160p-ultra: 2.26% ±0.60% Fortnite-trace-dx11-2160p-epix: 2.12% ±0.53% Nba2K23-trace-dx11-2160p-ultra: 1.98% ±0.30% Control-trace-dx11-1440p-high: 1.93% ±0.36% GodOfWar-trace-dx11-2160p-ultra: 1.62% ±0.47% TotalWarPharaoh-trace-dx11-1440p-ultra: 1.55% ±0.18% MountAndBlade2-trace-dx11-1440p-veryhigh: 1.51% ±0.37% Destiny2-trace-dx11-1440p-highest: 1.44% ±0.34% GtaV-trace-dx11-2160p-ultra: 1.26% ±0.27% ShadowTombRaider-trace-dx11-2160p-ultra: 1.10% ±0.58% Borderlands3-trace-dx11-2160p-ultra: 0.95% ±0.43% TerminatorResistance-trace-dx11-2160p-ultra: 0.87% ±0.22% BaldursGate3-trace-dx11-1440p-ultra: 0.84% ±0.28% CitiesSkylines2-trace-dx11-1440p-high: 0.82% ±0.22% PubG-trace-dx11-1440p-ultra: 0.72% ±0.37% Palworld-trace-dx11-1080p-med: 0.71% ±0.26% Superposition-trace-dx11-2160p-extreme: 0.69% ±0.19% The compile-time cost of shader-db increases significantly by 1.85% after this commit (14 iterations, 5% significance), the compile-time of fossil-db doesn't change significantly in my setup. v2: Addressed interaction with 81594d0db180398f48634438c8c8b5b9ab6a, since the code that calculates deps, delays and exits is no longer mode-independent after this change. Instead of reverting that commit (which is non-trivial and would have a greater compile-time hit) simply reconstruct the scheduler object during the transition between BRW_SCHEDULE_PRE_LATENCY and any other PRE mode that doesn't require instruction latencies. 
Reviewed-by: Lionel Landwerlin Part-of: --- .../compiler/brw_schedule_instructions.cpp | 43 +++++++++++++++---- src/intel/compiler/brw_shader.cpp | 5 +++ src/intel/compiler/brw_shader.h | 1 + 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 5c51893d4fb..b2fe12aca57 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -587,7 +587,7 @@ schedule_node::set_latency(const struct brw_isa_info *isa) class brw_instruction_scheduler { public: brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, int grf_count, int hw_reg_count, - int block_count, bool post_reg_alloc); + int block_count, bool post_reg_alloc, bool need_latencies); void add_barrier_deps(schedule_node *n); void add_cross_lane_deps(schedule_node *n); @@ -706,8 +706,9 @@ public: }; brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, - int grf_count, int hw_reg_count, - int block_count, bool post_reg_alloc) + int grf_count, int hw_reg_count, + int block_count, bool post_reg_alloc, + bool need_latencies) : s(s) { this->mem_ctx = mem_ctx; @@ -724,7 +725,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh foreach_block_and_inst(block, brw_inst, inst, s->cfg) { n->inst = inst; - if (!post_reg_alloc) + if (!need_latencies) n->latency = 1; else n->set_latency(isa); @@ -742,7 +743,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh current.available.make_empty(); this->hw_reg_count = hw_reg_count; - this->mode = BRW_SCHEDULE_NONE; + this->mode = (need_latencies ? 
BRW_SCHEDULE_PRE_LATENCY : BRW_SCHEDULE_NONE); this->reg_pressure = 0; if (!post_reg_alloc) { @@ -1571,7 +1572,7 @@ brw_instruction_scheduler::choose_instruction_to_schedule() { schedule_node *chosen = NULL; - if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) { + if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_PRE_LATENCY || mode == BRW_SCHEDULE_POST) { int chosen_time = 0; /* Of the instructions ready to execute or the closest to being ready, @@ -1848,8 +1849,16 @@ brw_prepare_scheduler(brw_shader &s, void *mem_ctx) const int grf_count = s.alloc.count; brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler); - return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf, - s.cfg->num_blocks, /* post_reg_alloc */ false); + return new (empty) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, grf_count, + s.first_non_payload_grf, s.cfg->num_blocks, + /* post_reg_alloc */ false, s.devinfo->ver >= 30); +} + +static bool +needs_instruction_latencies(brw_instruction_scheduler_mode mode) +{ + return mode == BRW_SCHEDULE_PRE_LATENCY || + mode == BRW_SCHEDULE_POST; } void @@ -1859,6 +1868,21 @@ brw_schedule_instructions_pre_ra(brw_shader &s, brw_instruction_scheduler *sched if (mode == BRW_SCHEDULE_NONE) return; + if (needs_instruction_latencies(mode) != needs_instruction_latencies(sched->mode)) { + /* The new mode requires different instruction latencies, which + * requires recalculating the dependency graph as well as the + * delay and exit metadata. Instead of maintaining a codepath + * to reset and recompute most of the scheduler data structure + * simply recreate the scheduler object. 
+ */ + void *mem_ctx = ralloc_parent(sched); + ralloc_free(sched->mem_ctx); + new (sched) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, s.alloc.count, + s.first_non_payload_grf, s.cfg->num_blocks, + /* post_reg_alloc */ false, + needs_instruction_latencies(mode)); + } + sched->run(mode); s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); @@ -1868,12 +1892,13 @@ void brw_schedule_instructions_post_ra(brw_shader &s) { const bool post_reg_alloc = true; + const bool need_latencies = true; const int grf_count = reg_unit(s.devinfo) * s.grf_used; void *mem_ctx = ralloc_context(NULL); brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf, - s.cfg->num_blocks, post_reg_alloc); + s.cfg->num_blocks, post_reg_alloc, need_latencies); sched.run(BRW_SCHEDULE_POST); ralloc_free(mem_ctx); diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 5bbb4b10bd1..360e9794363 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -1087,6 +1087,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) bool allocated; static const enum brw_instruction_scheduler_mode pre_modes[] = { + BRW_SCHEDULE_PRE_LATENCY, BRW_SCHEDULE_PRE, BRW_SCHEDULE_PRE_NON_LIFO, BRW_SCHEDULE_NONE, @@ -1094,6 +1095,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) }; static const char *scheduler_mode_name[] = { + [BRW_SCHEDULE_PRE_LATENCY] = "latency-sensitive", [BRW_SCHEDULE_PRE] = "top-down", [BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo", [BRW_SCHEDULE_PRE_LIFO] = "lifo", @@ -1130,6 +1132,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { enum brw_instruction_scheduler_mode sched_mode = pre_modes[i]; + if (devinfo->ver < 30 && sched_mode == BRW_SCHEDULE_PRE_LATENCY) + continue; + brw_schedule_instructions_pre_ra(s, sched, sched_mode); s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; diff --git 
a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index 44aa044c12a..3d6a0fd9511 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -286,6 +286,7 @@ void brw_calculate_cfg(brw_shader &s); void brw_optimize(brw_shader &s); enum brw_instruction_scheduler_mode { + BRW_SCHEDULE_PRE_LATENCY, BRW_SCHEDULE_PRE, BRW_SCHEDULE_PRE_NON_LIFO, BRW_SCHEDULE_PRE_LIFO,