intel/brw: Only force g0's liveness to be the whole program if spilling

We don't actually need to extend g0's live range to the EOT message
generally - most messages that end a shader are headerless.  The main
implicit use of g0 is for constructing scratch headers.  With the last
two patches, we now consider scratch access that may exist in the IR
and already extend the liveness appropriately.

There is one remaining problem: spilling.  The register allocator will
create new scratch messages when spilling a register, which need to
create scratch headers, which need g0.  So, every new spill or fill
might extend the live range of g0, which would create new interference,
altering the graph.  This can be problematic.

However, when compiling SIMD16 or SIMD32 fragment shaders, we don't
allow spilling anyway.  So, why not allow the use of g0?  Also, when trying
various scheduling modes, we first try allocation without spilling.
If it works, great, if not, we try a (hopefully) less aggressive
schedule, and only allow spilling on the lowest-pressure schedule.

So, even for regular SIMD8 shaders, we can potentially gain the use
of g0 on the first few tries at scheduling+allocation.

Once we try to allocate with spilling, we go back to reserving g0
for the entire program, so that we can construct scratch headers at
any point.  We could possibly do better here, but this is simple and
reliable with some benefit.

Thanks to Ian Romanick for suggesting I try this approach.

fossil-db on Alchemist shows some more spill/fill improvements:

   Totals:
   Instrs: 149062395 -> 149053010 (-0.01%); split: -0.01%, +0.00%
   Cycles: 12609496913 -> 12611652181 (+0.02%); split: -0.45%, +0.47%
   Spill count: 52891 -> 52471 (-0.79%)
   Fill count: 101599 -> 100818 (-0.77%)
   Scratch Memory Size: 3292160 -> 3197952 (-2.86%)

   Totals from 416541 (66.59% of 625484) affected shaders:
   Instrs: 124058587 -> 124049202 (-0.01%); split: -0.01%, +0.01%
   Cycles: 3567164271 -> 3569319539 (+0.06%); split: -1.61%, +1.67%
   Spill count: 420 -> 0 (-inf%)
   Fill count: 781 -> 0 (-inf%)
   Scratch Memory Size: 94208 -> 0 (-inf%)

Witcher 3 shows a 33% reduction in scratch memory size, for example.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30319>
This commit is contained in:
Kenneth Graunke 2024-07-22 17:22:47 -07:00
parent 4ca4b064cf
commit 8bca7e520c
4 changed files with 18 additions and 18 deletions

View file

@ -1515,7 +1515,7 @@ brw::register_pressure::register_pressure(const fs_visitor *v)
const unsigned payload_count = v->first_non_payload_grf; const unsigned payload_count = v->first_non_payload_grf;
int *payload_last_use_ip = new int[payload_count]; int *payload_last_use_ip = new int[payload_count];
v->calculate_payload_ranges(payload_count, payload_last_use_ip); v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
for (unsigned reg = 0; reg < payload_count; reg++) { for (unsigned reg = 0; reg < payload_count; reg++) {
for (int ip = 0; ip < payload_last_use_ip[reg]; ip++) for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)

View file

@ -294,7 +294,8 @@ public:
void assign_curb_setup(); void assign_curb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst); void convert_attr_sources_to_hw_regs(fs_inst *inst);
void calculate_payload_ranges(unsigned payload_node_count, void calculate_payload_ranges(bool allow_spilling,
unsigned payload_node_count,
int *payload_last_use_ip) const; int *payload_last_use_ip) const;
void assign_constant_locations(); void assign_constant_locations();
bool get_pull_locs(const brw_reg &src, unsigned *out_surf_index, bool get_pull_locs(const brw_reg &src, unsigned *out_surf_index,

View file

@ -151,7 +151,8 @@ count_to_loop_end(const bblock_t *block)
unreachable("not reached"); unreachable("not reached");
} }
void fs_visitor::calculate_payload_ranges(unsigned payload_node_count, void fs_visitor::calculate_payload_ranges(bool allow_spilling,
unsigned payload_node_count,
int *payload_last_use_ip) const int *payload_last_use_ip) const
{ {
int loop_depth = 0; int loop_depth = 0;
@ -226,18 +227,16 @@ void fs_visitor::calculate_payload_ranges(unsigned payload_node_count,
if (inst->send_ex_desc_scratch) if (inst->send_ex_desc_scratch)
payload_last_use_ip[0] = use_ip; payload_last_use_ip[0] = use_ip;
if (inst->eot) {
/* We could omit this for the !inst->header_present case, except
* that the simulator apparently incorrectly reads from g0/g1
* instead of sideband. It also really freaks out driver
* developers to see g0 used in unusual places, so just always
* reserve it.
*/
payload_last_use_ip[0] = use_ip;
}
ip++; ip++;
} }
/* g0 is needed to construct scratch headers for spilling. While we could
* extend its live range each time we spill a register, and update the
* interference graph accordingly, this would get pretty messy. Instead,
* simply consider g0 live for the whole program if spilling is allowed.
*/
if (allow_spilling)
payload_last_use_ip[0] = ip - 1;
} }
class fs_reg_alloc { class fs_reg_alloc {
@ -292,7 +291,7 @@ private:
int node_start_ip, int node_end_ip); int node_start_ip, int node_end_ip);
void setup_inst_interference(const fs_inst *inst); void setup_inst_interference(const fs_inst *inst);
void build_interference_graph(); void build_interference_graph(bool allow_spilling);
brw_reg build_lane_offsets(const fs_builder &bld, brw_reg build_lane_offsets(const fs_builder &bld,
uint32_t spill_offset, int ip); uint32_t spill_offset, int ip);
@ -513,7 +512,7 @@ fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
} }
void void
fs_reg_alloc::build_interference_graph() fs_reg_alloc::build_interference_graph(bool allow_spilling)
{ {
/* Compute the RA node layout */ /* Compute the RA node layout */
node_count = 0; node_count = 0;
@ -528,7 +527,7 @@ fs_reg_alloc::build_interference_graph()
last_vgrf_node = node_count - 1; last_vgrf_node = node_count - 1;
first_spill_node = node_count; first_spill_node = node_count;
fs->calculate_payload_ranges(payload_node_count, fs->calculate_payload_ranges(allow_spilling, payload_node_count,
payload_last_use_ip); payload_last_use_ip);
assert(g == NULL); assert(g == NULL);
@ -1069,7 +1068,7 @@ fs_reg_alloc::spill_reg(unsigned spill_reg)
bool bool
fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all) fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
{ {
build_interference_graph(); build_interference_graph(allow_spilling);
unsigned spilled = 0; unsigned spilled = 0;
while (1) { while (1) {

View file

@ -848,7 +848,7 @@ instruction_scheduler::setup_liveness(cfg_t *cfg)
} }
int payload_last_use_ip[hw_reg_count]; int payload_last_use_ip[hw_reg_count];
s->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); s->calculate_payload_ranges(true, hw_reg_count, payload_last_use_ip);
for (unsigned i = 0; i < hw_reg_count; i++) { for (unsigned i = 0; i < hw_reg_count; i++) {
if (payload_last_use_ip[i] == -1) if (payload_last_use_ip[i] == -1)