/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
#include "util/set.h"
#include "util/register_allocate.h"

using namespace brw;

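/* Rewrite a VGRF reference to the hardware GRF chosen by the allocator,
 * folding whole-register offsets into the register number and keeping the
 * remaining sub-register offset.
 */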
static void
assign_reg(const struct intel_device_info *devinfo,
           unsigned *reg_hw_locations, brw_reg *reg)
{
   if (reg->file == VGRF) {
      reg->nr = reg_unit(devinfo) * reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
      reg->offset %= REG_SIZE;
   }
}

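/* Assign hardware registers without graph coloring: pack each virtual GRF
 * directly after the previous one, starting at the first non-payload GRF.
 * Fails the compile if this simple packing runs out of registers.
 */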
void
brw_assign_regs_trivial(fs_visitor &s)
{
   const struct intel_device_info *devinfo = s.devinfo;
   unsigned *hw_reg_mapping = ralloc_array(NULL, unsigned, s.alloc.count + 1);
   unsigned i;
   int reg_width = s.dispatch_width / 8;

   /* Note that compressed instructions require alignment to 2 registers. */
   hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
   for (i = 1; i <= s.alloc.count; i++) {
      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
                           DIV_ROUND_UP(s.alloc.sizes[i - 1],
                                        reg_unit(devinfo)));
   }
   s.grf_used = hw_reg_mapping[s.alloc.count];

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      assign_reg(devinfo, hw_reg_mapping, &inst->dst);
      for (i = 0; i < inst->sources; i++) {
         assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
      }
   }

   if (s.grf_used >= BRW_MAX_GRF) {
      s.fail("Ran out of regs on trivial allocator (%d/%d)\n",
             s.grf_used, BRW_MAX_GRF);
   } else {
      s.alloc.count = s.grf_used;
   }

   ralloc_free(hw_reg_mapping);
}

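/* Build the register set and the contiguous register classes used by the
 * graph-coloring allocator, and stash them on the brw_compiler so they can
 * be shared across compiles.
 */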
extern "C" void
brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   int base_reg_count = BRW_MAX_GRF;

   /* The registers used to make up almost all values handled in the compiler
    * are a scalar value occupying a single register (or 2 registers in the
    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
    * multiplying allocated register numbers by 2).  Things that were
    * aggregates of scalar values at the GLSL level were split to scalar
    * values by split_virtual_grfs().
    *
    * However, texture SEND messages return a series of contiguous registers
    * to write into.  We currently always ask for 4 registers, but we may
    * convert that to use less some day.
    *
    * Additionally, on gfx5 we need aligned pairs of registers for the PLN
    * instruction, and on gfx4 we need 8 contiguous regs for workaround simd16
    * texturing.
    */
   assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo));
   int class_sizes[REG_CLASS_COUNT];
   for (unsigned i = 0; i < REG_CLASS_COUNT; i++)
      class_sizes[i] = i + 1;

   struct ra_regs *regs = ra_alloc_reg_set(compiler, BRW_MAX_GRF, false);
   ra_set_allocate_round_robin(regs);
   struct ra_class **classes = ralloc_array(compiler, struct ra_class *,
                                            REG_CLASS_COUNT);

   /* Now, make the register classes for each size of contiguous register
    * allocation we might need to make.
    */
   for (int i = 0; i < REG_CLASS_COUNT; i++) {
      classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]);

      for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++)
         ra_class_add_reg(classes[i], reg);
   }

   ra_set_finalize(regs, NULL);

   compiler->fs_reg_set.regs = regs;
   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_set.classes); i++)
      compiler->fs_reg_set.classes[i] = NULL;
   for (int i = 0; i < REG_CLASS_COUNT; i++)
      compiler->fs_reg_set.classes[class_sizes[i] - 1] = classes[i];
}

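/* Given the block containing a loop's DO, return the IP of the end of the
 * block holding the matching WHILE.
 */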
static int
count_to_loop_end(const bblock_t *block)
{
   if (block->end()->opcode == BRW_OPCODE_WHILE)
      return block->end_ip;

   int depth = 1;
   /* Skip the first block, since we don't want to count the DO that the
    * calling function already found.
    */
   for (block = block->next();
        depth > 0;
        block = block->next()) {
      if (block->start()->opcode == BRW_OPCODE_DO)
         depth++;
      if (block->end()->opcode == BRW_OPCODE_WHILE) {
         depth--;
         if (depth == 0)
            return block->end_ip;
      }
   }
   unreachable("not reached");
}

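/* Compute the last-use IP of every payload register node.  Uses inside a
 * loop extend the range to the end of the outermost loop, and g0 is kept
 * live to the end of the program when spilling is allowed.
 */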
void fs_visitor::calculate_payload_ranges(bool allow_spilling,
                                          unsigned payload_node_count,
                                          int *payload_last_use_ip) const
{
   int loop_depth = 0;
   int loop_end_ip = 0;

   for (unsigned i = 0; i < payload_node_count; i++)
      payload_last_use_ip[i] = -1;

   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;

         /* Since payload regs are deffed only at the start of the shader
          * execution, any uses of the payload within a loop mean the live
          * interval extends to the end of the outermost loop.  Find the ip of
          * the end now.
          */
         if (loop_depth == 1)
            loop_end_ip = count_to_loop_end(block);
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      default:
         break;
      }

      int use_ip;
      if (loop_depth > 0)
         use_ip = loop_end_ip;
      else
         use_ip = ip;

      /* Note that UNIFORM args have been turned into FIXED_GRF by
       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
       * the start (see interp_reg()).
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF) {
            unsigned reg_nr = inst->src[i].nr;
            if (reg_nr / reg_unit(devinfo) >= payload_node_count)
               continue;

            for (unsigned j = reg_nr / reg_unit(devinfo);
                 j < DIV_ROUND_UP(reg_nr + regs_read(devinfo, inst, i),
                                  reg_unit(devinfo));
                 j++) {
               payload_last_use_ip[j] = use_ip;
               assert(j < payload_node_count);
            }
         }
      }

      if (inst->dst.file == FIXED_GRF) {
         unsigned reg_nr = inst->dst.nr;
         if (reg_nr / reg_unit(devinfo) < payload_node_count) {
            for (unsigned j = reg_nr / reg_unit(devinfo);
                 j < DIV_ROUND_UP(reg_nr + regs_written(inst),
                                  reg_unit(devinfo));
                 j++) {
               payload_last_use_ip[j] = use_ip;
               assert(j < payload_node_count);
            }
         }
      }

      /* The generator implicitly uses g0 to construct extended message
       * descriptors for scratch send messages when this bit is set.
       */
      if (inst->send_ex_desc_scratch)
         payload_last_use_ip[0] = use_ip;

      ip++;
   }

   /* g0 is needed to construct scratch headers for spilling.  While we could
    * extend its live range each time we spill a register, and update the
    * interference graph accordingly, this would get pretty messy.  Instead,
    * simply consider g0 live for the whole program if spilling is required.
    */
   if (allow_spilling)
      payload_last_use_ip[0] = ip - 1;
}

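/* Graph-coloring register allocator state for a single shader compile: owns
 * the interference graph, payload liveness information, and spill
 * bookkeeping.
 */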
class fs_reg_alloc {
public:
   fs_reg_alloc(fs_visitor *fs):
      fs(fs), devinfo(fs->devinfo), compiler(fs->compiler),
      live(fs->live_analysis.require()), g(NULL),
      have_spill_costs(false)
   {
      mem_ctx = ralloc_context(NULL);

      /* Stash the number of instructions so we can sanity check that our
       * counts still match liveness.
       */
      live_instr_count = fs->cfg->last_block()->end_ip + 1;

      spill_insts = _mesa_pointer_set_create(mem_ctx);

      /* Most of this allocation was written for a reg_width of 1
       * (dispatch_width == 8).  In extending to SIMD16, the code was
       * left in place and it was converted to have the hardware
       * registers it's allocating be contiguous physical pairs of regs
       * for reg_width == 2.
       */
      int reg_width = fs->dispatch_width / 8;
      payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);

      /* Get payload IP information */
      payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);

      node_count = 0;
      first_payload_node = 0;
      grf127_send_hack_node = 0;
      first_vgrf_node = 0;
      last_vgrf_node = 0;
      first_spill_node = 0;

      spill_vgrf_ip = NULL;
      spill_vgrf_ip_alloc = 0;
      spill_node_count = 0;
   }

   ~fs_reg_alloc()
   {
      ralloc_free(mem_ctx);
   }

   bool assign_regs(bool allow_spilling, bool spill_all);

private:
   void setup_live_interference(unsigned node,
                                int node_start_ip, int node_end_ip);
   void setup_inst_interference(const fs_inst *inst);

   void build_interference_graph(bool allow_spilling);

   brw_reg build_lane_offsets(const fs_builder &bld,
                              uint32_t spill_offset, int ip);
   brw_reg build_single_offset(const fs_builder &bld,
                               uint32_t spill_offset, int ip);
   brw_reg build_legacy_scratch_header(const fs_builder &bld,
                                       uint32_t spill_offset, int ip);

   void emit_unspill(const fs_builder &bld, struct brw_shader_stats *stats,
                     brw_reg dst, uint32_t spill_offset, unsigned count, int ip);
   void emit_spill(const fs_builder &bld, struct brw_shader_stats *stats,
                   brw_reg src, uint32_t spill_offset, unsigned count, int ip);

   void set_spill_costs();
   int choose_spill_reg();
   brw_reg alloc_spill_reg(unsigned size, int ip);
   void spill_reg(unsigned spill_reg);

   void *mem_ctx;
   fs_visitor *fs;
   const intel_device_info *devinfo;
   const brw_compiler *compiler;
   const fs_live_variables &live;
   int live_instr_count;

   set *spill_insts;

   ra_graph *g;
   bool have_spill_costs;

   int payload_node_count;
   int *payload_last_use_ip;

   int node_count;
   int first_payload_node;
   int grf127_send_hack_node;
   int first_vgrf_node;
   int last_vgrf_node;
   int first_spill_node;

   int *spill_vgrf_ip;
   int spill_vgrf_ip_alloc;
   int spill_node_count;
};

namespace {
   /**
    * Maximum spill block size we expect to encounter in 32B units.
    *
    * This is somewhat arbitrary and doesn't necessarily limit the maximum
    * variable size that can be spilled -- A higher value will allow a
    * variable of a given size to be spilled more efficiently with a smaller
    * number of scratch messages, but will increase the likelihood of a
    * collision between the MRFs reserved for spilling and other MRFs used by
    * the program (and possibly increase GRF register pressure on platforms
    * without hardware MRFs), which could cause register allocation to fail.
    *
    * For the moment reserve just enough space so a register of 32 bit
    * component type and natural region width can be spilled without splitting
    * into multiple (force_writemask_all) scratch messages.
    */
   unsigned
   spill_max_size(const fs_visitor *s)
   {
      /* LSC is limited to SIMD16 sends */
      if (s->devinfo->has_lsc)
         return 2;

      /* FINISHME - On Gfx7+ it should be possible to avoid this limit
       * altogether by spilling directly from the temporary GRF
       * allocated to hold the result of the instruction (and the
       * scratch write header).
       */
      /* FINISHME - The shader's dispatch width probably belongs in
       * backend_shader (or some nonexistent fs_shader class?)
       * rather than in the visitor class.
       */
      return s->dispatch_width / 8;
   }
}

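/* Add interference between the given node and any payload node still live at
 * the node's start, and with every lower-numbered VGRF node whose live range
 * overlaps [node_start_ip, node_end_ip].
 */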
void
fs_reg_alloc::setup_live_interference(unsigned node,
                                      int node_start_ip, int node_end_ip)
{
   /* Mark any virtual grf that is live between the start of the program and
    * the last use of a payload node interfering with that payload node.
    */
   for (int i = 0; i < payload_node_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      /* Note that we use a <= comparison, unlike vgrfs_interfere(),
       * in order to not have to worry about the uniform issue described in
       * calculate_live_intervals().
       */
      if (node_start_ip <= payload_last_use_ip[i])
         ra_add_node_interference(g, node, first_payload_node + i);
   }

   /* Add interference with every vgrf whose live range intersects this
    * node's.  We only need to look at nodes below this one as the reflexivity
    * of interference will take care of the rest.
    */
   for (unsigned n2 = first_vgrf_node;
        n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) {
      unsigned vgrf = n2 - first_vgrf_node;
      if (!(node_end_ip <= live.vgrf_start[vgrf] ||
            live.vgrf_end[vgrf] <= node_start_ip))
         ra_add_node_interference(g, node, n2);
   }
}

/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 * - Virtual opcodes that translate to multiple instructions in the
 *   code generator: if src == dst and one instruction writes the
 *   destination before a later instruction reads the source, then
 *   src will have been clobbered.
 *
 * - SIMD16 compressed instructions with certain regioning (see below).
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
static bool
brw_inst_has_source_and_destination_hazard(const fs_inst *inst)
{
   switch (inst->opcode) {
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
   case SHADER_OPCODE_SHUFFLE:
      /* This instruction returns an arbitrary channel from the source and
       * gets split into smaller instructions in the generator.  It's possible
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       *   mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       *   mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   case SHADER_OPCODE_QUAD_SWIZZLE:
      switch (inst->src[1].ud) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         /* These can be implemented as a single Align1 region on all
          * platforms, so there's never a hazard between source and
          * destination.  C.f. fs_generator::generate_quad_swizzle().
          */
         return false;
      default:
         return !is_uniform(inst->src[0]);
      }
   case BRW_OPCODE_DPAS:
      /* This is overly conservative. The actual hazard is more complicated to
       * describe. When the repeat count is N, the single instruction behaves
       * like N instructions with a repeat count of one, but the destination
       * and source registers are incremented (in somewhat complex ways) for
       * each instruction.
       *
       * This means the source and destination register is actually a range of
       * registers. The hazard exists if an earlier iteration would write a
       * register that should be read by a later iteration.
       *
       * There may be some advantage to properly modeling this, but for now,
       * be overly conservative.
       */
      return inst->rcount > 1;
   default:
      /* The SIMD16 compressed instruction
       *
       *   add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *
       * is actually decoded in hardware as:
       *
       *   add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
       *
       * Which is safe.  However, if we have uniform accesses
       * happening, we get into trouble:
       *
       *   add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
       *
       * Now our destination for the first instruction overwrote the
       * second instruction's src0, and we get garbage for those 8
       * pixels.  There's a similar issue for the pre-gfx6
       * pixel_x/pixel_y, which are registers of 16-bit values and thus
       * would get stomped by the first decode as well.
       */
      if (inst->exec_size == 16) {
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF && (inst->src[i].stride == 0 ||
                                              inst->src[i].type == BRW_TYPE_UW ||
                                              inst->src[i].type == BRW_TYPE_W ||
                                              inst->src[i].type == BRW_TYPE_UB ||
                                              inst->src[i].type == BRW_TYPE_B)) {
               return true;
            }
         }
      }
      return false;
   }
}

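/* Add interference edges and fixed-register constraints implied by a single
 * instruction: source/destination hazards, compressed-instruction overlap,
 * the grf127 send restriction, split-send source overlap, and EOT message
 * placement.
 */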
void
fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
{
   /* Certain instructions can't safely use the same register for their
    * sources and destination.  Add interference.
    */
   if (inst->dst.file == VGRF && brw_inst_has_source_and_destination_hazard(inst)) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                     first_vgrf_node + inst->src[i].nr);
         }
      }
   }

   /* A compressed instruction is actually two instructions executed
    * simultaneously.  On most platforms, it's ok to have the source and
    * destination registers be the same.  In this case, each instruction
    * over-writes its own source and there's no problem.  The real problem
    * here is if the source and destination registers are off by one.  Then
    * you can end up in a scenario where the first instruction over-writes the
    * source of the second instruction.  Since the compiler doesn't know about
    * this level of granularity, we simply make the source and destination
    * interfere.
    */
   if (inst->dst.component_size(inst->exec_size) > REG_SIZE &&
       inst->dst.file == VGRF) {
      for (int i = 0; i < inst->sources; ++i) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                     first_vgrf_node + inst->src[i].nr);
         }
      }
   }

   if (grf127_send_hack_node >= 0) {
      /* In the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * We avoid using grf127 as part of the destination of send messages by
       * adding a node interference to the grf127_send_hack_node.  This node
       * has a fixed assignment to grf127.
       *
       * We don't apply it to SIMD16 instructions because previous code avoids
       * any register overlap between sources and destination.
       */
      if (inst->exec_size < 16 && inst->is_send_from_grf() &&
          inst->dst.file == VGRF)
         ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                  grf127_send_hack_node);
   }

   /* From the Skylake PRM Vol. 2a docs for sends:
    *
    *    "It is required that the second block of GRFs does not overlap with
    *    the first block."
    *
    * Normally, this is taken care of by fixup_sends_duplicate_payload() but
    * in the case where one of the registers is an undefined value, the
    * register allocator may decide that they don't interfere even though
    * they're used as sources in the same instruction.  We also need to add
    * interference here.
    */
   if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
       inst->src[2].file == VGRF && inst->src[3].file == VGRF &&
       inst->src[2].nr != inst->src[3].nr)
      ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr,
                               first_vgrf_node + inst->src[3].nr);

   /* When we do send-from-GRF for FB writes, we need to ensure that the last
    * write instruction sends from a high register.  This is because the
    * vertex fetcher wants to start filling the low payload registers while
    * the pixel data port is still working on writing out the memory.  If we
    * don't do this, we get rendering artifacts.
    *
    * We could just do "something high".  Instead, we just pick the highest
    * register that works.
    */
   if (inst->eot && devinfo->ver < 30) {
      const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
                       inst->src[2].nr : inst->src[0].nr;
      const int size = DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
      int reg = BRW_MAX_GRF - size;

      if (grf127_send_hack_node >= 0) {
         /* Avoid r127 which might be unusable if the node was previously
          * written by a SIMD8 SEND message with source/destination overlap.
          */
         reg--;
      }

      assert(reg >= 112);
      ra_set_node_reg(g, first_vgrf_node + vgrf, reg);

      if (inst->ex_mlen > 0) {
         const int vgrf = inst->src[3].nr;
         reg -= DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
         assert(reg >= 112);
         ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
      }
   }
}

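/* Lay out the RA nodes (payload registers, the grf127 workaround node, and
 * one node per VGRF), then add interference edges derived from liveness and
 * from per-instruction constraints.
 */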
void
fs_reg_alloc::build_interference_graph(bool allow_spilling)
{
   /* Compute the RA node layout */
   node_count = 0;
   first_payload_node = node_count;
   node_count += payload_node_count;

   grf127_send_hack_node = node_count;
   node_count++;

   first_vgrf_node = node_count;
   node_count += fs->alloc.count;
   last_vgrf_node = node_count - 1;
   first_spill_node = node_count;

   fs->calculate_payload_ranges(allow_spilling, payload_node_count,
                                payload_last_use_ip);

   assert(g == NULL);
   g = ra_alloc_interference_graph(compiler->fs_reg_set.regs, node_count);
   ralloc_steal(mem_ctx, g);

   /* Set up the payload nodes */
   for (int i = 0; i < payload_node_count; i++)
      ra_set_node_reg(g, first_payload_node + i, i);

   if (grf127_send_hack_node >= 0)
      ra_set_node_reg(g, grf127_send_hack_node, 127);

   /* Specify the classes of each virtual register. */
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      unsigned size = DIV_ROUND_UP(fs->alloc.sizes[i], reg_unit(devinfo));

      assert(size <= ARRAY_SIZE(compiler->fs_reg_set.classes) &&
             "Register allocation relies on split_virtual_grfs()");

      ra_set_node_class(g, first_vgrf_node + i,
                        compiler->fs_reg_set.classes[size - 1]);
   }

   /* Add interference based on the live range of the register */
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      setup_live_interference(first_vgrf_node + i,
                              live.vgrf_start[i],
                              live.vgrf_end[i]);
   }

   /* Add interference based on the instructions in which a register is used.
    */
   foreach_block_and_inst(block, fs_inst, inst, fs->cfg)
      setup_inst_interference(inst);
}

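/* Emit a single scalar UD register holding the spill offset, used by the
 * transposed (SIMD1) LSC scratch messages.
 */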
brw_reg
fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip)
{
   brw_reg offset = retype(alloc_spill_reg(1, ip), BRW_TYPE_UD);
   fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset));
   _mesa_set_add(spill_insts, inst);
   return offset;
}

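/* Emit per-lane byte offsets into the spill area (lane index scaled to
 * dwords, plus the base spill offset) for non-transposed LSC scratch
 * messages.
 */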
brw_reg
fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
{
   /* LSC messages are limited to SIMD16 */
   assert(bld.dispatch_width() <= 16);

   const fs_builder ubld = bld.exec_all();
   const unsigned reg_count = ubld.dispatch_width() / 8;

   brw_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_TYPE_UD);
   fs_inst *inst;

   /* Build an offset per lane in SIMD8 */
   inst = ubld.group(8, 0).MOV(retype(offset, BRW_TYPE_UW),
                               brw_imm_uv(0x76543210));
   _mesa_set_add(spill_insts, inst);
   inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_TYPE_UW));
   _mesa_set_add(spill_insts, inst);

   /* Build offsets in the upper 8 lanes of SIMD16 */
   if (ubld.dispatch_width() > 8) {
      inst = ubld.group(8, 0).ADD(
         byte_offset(offset, REG_SIZE),
         byte_offset(offset, 0),
         brw_imm_ud(8));
      _mesa_set_add(spill_insts, inst);
   }

   /* Make the offset a dword */
   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
   _mesa_set_add(spill_insts, inst);

   /* Add the base offset */
   if (spill_offset) {
      inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
      _mesa_set_add(spill_insts, inst);
   }

   return offset;
}

/**
|
|
|
|
|
* Generate a scratch header for pre-LSC platforms.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
intel/brw: Build the scratch header on the fly for pre-LSC systems
Instead of reserving a register to contain the spill header, which
gets marked live for the entire program, we can just emit the ALU
instructions to build it on the fly. (This is similar to the way
we handle scratch on Alchemist with the newer LSC data port.)
There are a couple of downsides that make this not obviously a win.
First, in order to construct the scratch header on Gfx9-12, we have
to use fields from g0, which will have to remain live anywhere that
scratch access is required. This could negate the register pressure
benefits of creating the header on the fly. However, g0 is oft used
in other places anyway, so it may already be there. Another is that
it's a non-trivial number of ALU instructions to construct the value.
Still, trading lower pressure (so fewer spills, less memory access
and stalls) for more cheap ALU seems like it ought to be a win.
There is another valuable benefit: by not reserving a register, we
eliminate the need to reconstruct the interference graph. (The next
patch will actually do so.)
shader-db on Icelake shows spills/fills at 54/53 helped, 4/10 hurt,
and an 8% increase in ALU on affected shaders. Synmark's OglCSDof
(a benchmark that spills) performance remains the same on Alderlake.
fossil-db on Icelake shows a 5.6%/5.1% reduction in spills/fills and a
4% reduction in scratch memory size on affected shaders. Instruction
counts go up by 11.07%, but cycle estimates only increase by 0.57%.
Assassin's Creed Odyssey and Wolfenstein Youngblood both see 20-30%
reductions in spills/fills, a significant improvement.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25811>
2023-09-30 03:14:45 -07:00
|
|
|
fs_reg_alloc::build_legacy_scratch_header(const fs_builder &bld,
|
|
|
|
|
uint32_t spill_offset, int ip)
|
|
|
|
|
{
|
|
|
|
|
const fs_builder ubld8 = bld.exec_all().group(8, 0);
|
|
|
|
|
const fs_builder ubld1 = bld.exec_all().group(1, 0);
|
|
|
|
|
|
|
|
|
|
/* Allocate a spill header and make it interfere with g0 */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg header = retype(alloc_spill_reg(1, ip), BRW_TYPE_UD);
|
intel/brw: Build the scratch header on the fly for pre-LSC systems
Instead of reserving a register to contain the spill header, which
gets marked live for the entire program, we can just emit the ALU
instructions to build it on the fly. (This is similar to the way
we handle scratch on Alchemist with the newer LSC data port.)
There are a couple of downsides that make this not obviously a win.
First, in order to construct the scratch header on Gfx9-12, we have
to use fields from g0, which will have to remain live anywhere that
scratch access is required. This could negate the register pressure
benefits of creating the header on the fly. However, g0 is oft used
in other places anyway, so it may already be there. Another is that
it's a non-trivial number of ALU instructions to construct the value.
Still, trading lower pressure (so fewer spills, less memory access
and stalls) for more cheap ALU seems like it ought to be a win.
There is another valuable benefit: by not reserving a register, we
eliminate the need to reconstruct the interference graph. (The next
patch will actually do so.)
shader-db on Icelake shows spills/fills at 54/53 helped, 4/10 hurt,
and an 8% increase in ALU on affected shaders. Synmark's OglCSDof
(a benchmark that spills) performance remains the same on Alderlake.
fossil-db on Icelake shows a 5.6%/5.1% reduction in spills/fills and a
4% reduction in scratch memory size on affected shaders. Instruction
counts go up by 11.07%, but cycle estimates only increase by 0.57%.
Assassin's Creed Odyssey and Wolfenstein Youngblood both see 20-30%
reductions in spills/fills, a significant improvement.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25811>
2023-09-30 03:14:45 -07:00
|
|
|
ra_add_node_interference(g, first_vgrf_node + header.nr, first_payload_node);
|
|
|
|
|
|
2024-07-22 00:22:20 -07:00
|
|
|
fs_inst *inst =
|
|
|
|
|
ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
|
2023-09-30 03:14:45 -07:00
|
|
|
_mesa_set_add(spill_insts, inst);
|
|
|
|
|
|
|
|
|
|
/* Write the scratch offset */
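/* The offset lands in dword 2 of the message header and is expressed in
 * owords (16-byte units), which is why it must be 16-byte aligned and is
 * divided by 16 below.
 */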
|
|
|
|
|
assert(spill_offset % 16 == 0);
|
|
|
|
|
inst = ubld1.MOV(component(header, 2), brw_imm_ud(spill_offset / 16));
|
|
|
|
|
_mesa_set_add(spill_insts, inst);
|
|
|
|
|
|
|
|
|
|
return header;
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-09 04:27:35 -05:00
|
|
|
void
|
2022-05-24 02:44:53 -07:00
|
|
|
fs_reg_alloc::emit_unspill(const fs_builder &bld,
|
2024-10-10 23:04:52 -07:00
|
|
|
struct brw_shader_stats *stats,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dst,
|
2022-07-18 12:27:53 +03:00
|
|
|
uint32_t spill_offset, unsigned count, int ip)
|
2010-10-19 09:25:51 -07:00
|
|
|
{
|
2021-04-05 13:19:39 -07:00
|
|
|
const intel_device_info *devinfo = bld.shader->devinfo;
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
|
|
|
|
|
REG_SIZE;
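/* Emit one scratch read per reg_size-register chunk; dst and the scratch
 * offset both advance by reg_size * REG_SIZE bytes per iteration.
 */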
|
2015-06-03 19:05:54 +03:00
|
|
|
|
2023-01-10 15:12:24 -08:00
|
|
|
for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
|
2022-05-24 02:44:53 -07:00
|
|
|
++stats->fill_count;
|
|
|
|
|
|
2020-10-08 14:32:30 -05:00
|
|
|
fs_inst *unspill_inst;
|
2022-07-18 12:27:53 +03:00
|
|
|
if (devinfo->verx10 >= 125) {
|
|
|
|
|
/* LSC is limited to SIMD16 load/store but we can load more using
|
|
|
|
|
* transpose messages.
|
|
|
|
|
*/
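/* A transposed load executes as a single channel (hence the group(1, 0)
 * builder below) and reads reg_size * 8 consecutive dwords from one
 * address, rather than one dword per lane.
 */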
|
2024-11-13 11:26:53 +02:00
|
|
|
const bool use_transpose = bld.dispatch_width() > 16 || bld.has_writemask_all();
|
2022-07-18 12:27:53 +03:00
|
|
|
const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg offset;
|
2022-07-18 12:27:53 +03:00
|
|
|
if (use_transpose) {
|
|
|
|
|
offset = build_single_offset(ubld, spill_offset, ip);
|
|
|
|
|
} else {
|
|
|
|
|
offset = build_lane_offsets(ubld, spill_offset, ip);
|
|
|
|
|
}
|
|
|
|
|
/* We leave the extended descriptor empty and flag the instruction to
|
|
|
|
|
* ask the generator to insert the extended descriptor in the address
|
|
|
|
|
* register. That way we don't need to burn an additional register
|
|
|
|
|
* for register allocation spill/fill.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[] = {
|
2022-07-18 12:27:53 +03:00
|
|
|
brw_imm_ud(0), /* desc */
|
|
|
|
|
brw_imm_ud(0), /* ex_desc */
|
|
|
|
|
offset, /* payload */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* payload2 */
|
2022-07-18 12:27:53 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
unspill_inst->sfid = GFX12_SFID_UGM;
|
|
|
|
|
unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
2022-11-14 15:54:01 +02:00
|
|
|
LSC_ADDR_SURFTYPE_SS,
|
2022-07-18 12:27:53 +03:00
|
|
|
LSC_ADDR_SIZE_A32,
|
|
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
use_transpose ? reg_size * 8 : 1 /* num_channels */,
|
|
|
|
|
use_transpose,
|
2022-09-29 12:38:19 -07:00
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-18 12:27:53 +03:00
|
|
|
unspill_inst->header_size = 0;
|
2022-09-28 16:17:02 -07:00
|
|
|
unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
|
|
|
|
|
unspill_inst->exec_size);
|
2022-07-18 12:27:53 +03:00
|
|
|
unspill_inst->ex_mlen = 0;
|
|
|
|
|
unspill_inst->size_written =
|
2022-09-28 16:17:02 -07:00
|
|
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
|
2022-07-18 12:27:53 +03:00
|
|
|
unspill_inst->send_has_side_effects = false;
|
|
|
|
|
unspill_inst->send_is_volatile = true;
|
|
|
|
|
unspill_inst->send_ex_desc_scratch = true;
|
2024-02-15 02:34:50 -08:00
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
2020-10-08 14:41:43 -05:00
|
|
|
|
2022-07-18 12:27:53 +03:00
|
|
|
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg ex_desc = brw_imm_ud(0);
|
2020-10-20 17:42:21 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
|
2020-10-08 14:41:43 -05:00
|
|
|
unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
unspill_inst->mlen = 1;
|
|
|
|
|
unspill_inst->header_size = 1;
|
|
|
|
|
unspill_inst->size_written = reg_size * REG_SIZE;
|
|
|
|
|
unspill_inst->send_has_side_effects = false;
|
|
|
|
|
unspill_inst->send_is_volatile = true;
|
2021-03-29 15:16:59 -07:00
|
|
|
unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
2020-10-08 14:41:43 -05:00
|
|
|
unspill_inst->desc =
|
2020-10-20 17:42:21 -05:00
|
|
|
brw_dp_desc(devinfo, bti,
|
2021-02-05 08:11:01 -06:00
|
|
|
BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
|
|
|
|
|
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
|
2013-10-16 11:51:22 -07:00
|
|
|
}
|
2020-10-09 04:27:35 -05:00
|
|
|
_mesa_set_add(spill_insts, unspill_inst);
|
2023-01-10 15:12:24 -08:00
|
|
|
assert(unspill_inst->force_writemask_all || count % reg_size == 0);
|
i965/fs: Make register spill/unspill only do the regs for that instruction.
Previously, if we were spilling the result of a texture call, we would store
all 4 regs, then for each use of one of those regs as the source of an
instruction, we would unspill all 4 regs even though only one was needed.
In both lightsmark and l4d2 with my current graphics config, the shaders that
produce spilling do so on split GRFs, so this doesn't help them out. However,
in a capture of the l4d2 shaders with a different snapshot and playing the
game instead of using a demo, it reduced one shader from 2817 instructions to
2179, due to choosing a now-cheaper texture result to spill instead of piles
of texcoords.
v2: Fix comment noted by Ken, and fix the if condition associated with it for
the current state of what constitutes a partial write of the destination.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1)
2012-07-06 17:18:35 -07:00
|
|
|
|
2016-09-01 12:42:20 -07:00
|
|
|
dst.offset += reg_size * REG_SIZE;
|
2014-10-24 11:41:25 -07:00
|
|
|
spill_offset += reg_size * REG_SIZE;
|
2014-08-18 14:27:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-09 04:27:35 -05:00
|
|
|
void
|
2022-05-24 02:44:53 -07:00
|
|
|
fs_reg_alloc::emit_spill(const fs_builder &bld,
|
2024-10-10 23:04:52 -07:00
|
|
|
struct brw_shader_stats *stats,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg src,
|
2022-07-18 12:27:53 +03:00
|
|
|
uint32_t spill_offset, unsigned count, int ip)
|
2014-08-18 14:27:55 -07:00
|
|
|
{
|
2021-04-05 13:19:39 -07:00
|
|
|
const intel_device_info *devinfo = bld.shader->devinfo;
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned reg_size = src.component_size(bld.dispatch_width()) /
|
|
|
|
|
REG_SIZE;
|
2015-06-03 19:05:54 +03:00
|
|
|
|
2023-01-10 15:12:24 -08:00
|
|
|
for (unsigned i = 0; i < DIV_ROUND_UP(count, reg_size); i++) {
|
2022-05-24 02:44:53 -07:00
|
|
|
++stats->spill_count;
|
|
|
|
|
|
2020-10-08 14:41:43 -05:00
|
|
|
fs_inst *spill_inst;
|
2022-07-18 12:27:53 +03:00
|
|
|
if (devinfo->verx10 >= 125) {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg offset = build_lane_offsets(bld, spill_offset, ip);
|
2022-07-18 12:27:53 +03:00
|
|
|
/* We leave the extended descriptor empty and flag the instruction
|
|
|
|
|
* to relocate the extended descriptor. That way the surface offset is
|
|
|
|
|
* directly put into the instruction and we don't need to use a
|
|
|
|
|
* register to hold it.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[] = {
|
2022-07-18 12:27:53 +03:00
|
|
|
brw_imm_ud(0), /* desc */
|
|
|
|
|
brw_imm_ud(0), /* ex_desc */
|
|
|
|
|
offset, /* payload */
|
|
|
|
|
src, /* payload2 */
|
|
|
|
|
};
|
|
|
|
|
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
spill_inst->sfid = GFX12_SFID_UGM;
|
|
|
|
|
spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
|
2022-11-14 15:54:01 +02:00
|
|
|
LSC_ADDR_SURFTYPE_SS,
|
2022-07-18 12:27:53 +03:00
|
|
|
LSC_ADDR_SIZE_A32,
|
|
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
1 /* num_channels */,
|
|
|
|
|
false /* transpose */,
|
2022-09-29 12:38:19 -07:00
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-18 12:27:53 +03:00
|
|
|
spill_inst->header_size = 0;
|
2022-09-28 16:17:02 -07:00
|
|
|
spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
|
|
|
|
|
bld.dispatch_width());
|
2022-07-18 12:27:53 +03:00
|
|
|
spill_inst->ex_mlen = reg_size;
|
|
|
|
|
spill_inst->size_written = 0;
|
|
|
|
|
spill_inst->send_has_side_effects = true;
|
|
|
|
|
spill_inst->send_is_volatile = false;
|
|
|
|
|
spill_inst->send_ex_desc_scratch = true;
|
2024-02-15 02:34:50 -08:00
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg header = build_legacy_scratch_header(bld, spill_offset, ip);
|
2020-10-08 14:41:43 -05:00
|
|
|
|
2022-07-18 12:27:53 +03:00
|
|
|
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg ex_desc = brw_imm_ud(0);
|
2020-10-20 17:42:21 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
|
2020-10-08 14:41:43 -05:00
|
|
|
spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
spill_inst->mlen = 1;
|
|
|
|
|
spill_inst->ex_mlen = reg_size;
|
|
|
|
|
spill_inst->size_written = 0;
|
|
|
|
|
spill_inst->header_size = 1;
|
|
|
|
|
spill_inst->send_has_side_effects = true;
|
|
|
|
|
spill_inst->send_is_volatile = false;
|
2021-03-29 15:16:59 -07:00
|
|
|
spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
2020-10-08 14:41:43 -05:00
|
|
|
spill_inst->desc =
|
2020-10-20 17:42:21 -05:00
|
|
|
brw_dp_desc(devinfo, bti,
|
2021-02-05 08:11:01 -06:00
|
|
|
GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
|
|
|
|
|
BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
|
2020-10-08 14:41:43 -05:00
|
|
|
}
|
2020-10-09 04:27:35 -05:00
|
|
|
_mesa_set_add(spill_insts, spill_inst);
|
2023-01-10 15:12:24 -08:00
|
|
|
assert(spill_inst->force_writemask_all || count % reg_size == 0);
|
2020-10-08 14:26:57 -05:00
|
|
|
|
|
|
|
|
src.offset += reg_size * REG_SIZE;
|
|
|
|
|
spill_offset += reg_size * REG_SIZE;
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
}
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
void
|
|
|
|
|
fs_reg_alloc::set_spill_costs()
|
2010-10-19 09:25:51 -07:00
|
|
|
{
|
2017-04-09 17:28:58 -07:00
|
|
|
float block_scale = 1.0;
|
2025-01-06 20:51:32 -08:00
|
|
|
float *spill_costs = rzalloc_array(NULL, float, fs->alloc.count);
|
2010-10-19 09:25:51 -07:00
|
|
|
|
|
|
|
|
/* Calculate costs for spilling nodes. Call it a cost of 1 per
|
|
|
|
|
* spill/unspill we'll have to do, and guess that the insides of
|
|
|
|
|
* loops run 10 times.
|
|
|
|
|
*/
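/* For illustration, assuming single-register accesses: a VGRF read twice
 * inside one loop and written once outside it accumulates a cost of
 * roughly 2 * 10 + 1 = 21 under this heuristic.
 */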
|
2019-05-07 20:09:08 -05:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2016-04-27 02:07:08 -07:00
|
|
|
if (inst->src[i].file == VGRF)
|
2024-06-19 10:50:51 -07:00
|
|
|
spill_costs[inst->src[i].nr] += regs_read(devinfo, inst, i) * block_scale;
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
2016-04-27 02:07:08 -07:00
|
|
|
if (inst->dst.file == VGRF)
|
2017-04-20 11:44:01 -07:00
|
|
|
spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;
|
2012-09-19 13:28:00 -07:00
|
|
|
|
2020-10-09 04:27:35 -05:00
|
|
|
/* Don't spill anything we generated while spilling */
|
|
|
|
|
if (_mesa_set_search(spill_insts, inst)) {
|
|
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == VGRF)
|
2025-01-06 20:58:35 -08:00
|
|
|
spill_costs[inst->src[i].nr] = INFINITY;
|
2020-10-09 04:27:35 -05:00
|
|
|
}
|
|
|
|
|
if (inst->dst.file == VGRF)
|
2025-01-06 20:58:35 -08:00
|
|
|
spill_costs[inst->dst.nr] = INFINITY;
|
2020-10-09 04:27:35 -05:00
|
|
|
}
|
|
|
|
|
|
2010-10-19 09:25:51 -07:00
|
|
|
switch (inst->opcode) {
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_DO:
|
2017-04-09 17:28:58 -07:00
|
|
|
block_scale *= 10;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_WHILE:
|
2017-04-09 17:28:58 -07:00
|
|
|
block_scale /= 10;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-04-09 17:28:58 -07:00
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
block_scale *= 0.5;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_ENDIF:
|
|
|
|
|
block_scale /= 0.5;
|
|
|
|
|
break;
|
|
|
|
|
|
2011-05-03 10:55:50 -07:00
|
|
|
default:
|
|
|
|
|
break;
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-07 20:09:08 -05:00
|
|
|
for (unsigned i = 0; i < fs->alloc.count; i++) {
|
2019-06-03 17:09:12 -05:00
|
|
|
/* Do the no_spill check first. Registers that are used as spill
|
|
|
|
|
* temporaries may have been allocated after we calculated liveness so
|
|
|
|
|
* we shouldn't look their liveness up. Fortunately, they're always
|
|
|
|
|
* used in SCRATCH_READ/WRITE instructions so they'll always be flagged
|
|
|
|
|
* no_spill.
|
|
|
|
|
*/
|
2025-01-06 20:58:35 -08:00
|
|
|
if (isinf(spill_costs[i]))
|
2019-06-03 17:09:12 -05:00
|
|
|
continue;
|
|
|
|
|
|
2016-03-13 16:25:57 -07:00
|
|
|
int live_length = live.vgrf_end[i] - live.vgrf_start[i];
|
2019-04-13 16:01:50 -05:00
|
|
|
if (live_length <= 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* Divide the cost (in number of spills/fills) by the log of the length
|
|
|
|
|
* of the live range of the register. This will encourage spill logic
|
|
|
|
|
* to spill long-living things before spilling short-lived things where
|
|
|
|
|
* spilling is less likely to actually do us any good. We use the log
|
|
|
|
|
* of the length because it will fall off very quickly and not cause us
|
|
|
|
|
* to spill medium length registers with more uses.
|
|
|
|
|
*/
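/* For example, a cost of 20 over a 100-instruction live range becomes
 * roughly 20 / ln(100) ~= 4.3, while the same cost over a 10-instruction
 * range stays around 20 / ln(10) ~= 8.7, making the longer-lived register
 * the more attractive spill candidate.
 */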
|
|
|
|
|
float adjusted_cost = spill_costs[i] / logf(live_length);
|
2019-06-03 17:09:12 -05:00
|
|
|
ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
2019-05-14 23:03:29 -05:00
|
|
|
|
|
|
|
|
have_spill_costs = true;
|
2025-01-06 20:51:32 -08:00
|
|
|
|
|
|
|
|
ralloc_free(spill_costs);
|
2019-05-08 13:34:04 -05:00
|
|
|
}
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
int
|
|
|
|
|
fs_reg_alloc::choose_spill_reg()
|
|
|
|
|
{
|
2019-05-14 23:03:29 -05:00
|
|
|
if (!have_spill_costs)
|
|
|
|
|
set_spill_costs();
|
|
|
|
|
|
2019-05-08 13:09:27 -05:00
|
|
|
int node = ra_get_best_spill_node(g);
|
|
|
|
|
if (node < 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
assert(node >= first_vgrf_node);
|
|
|
|
|
return node - first_vgrf_node;
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2019-05-08 13:34:04 -05:00
|
|
|
fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
|
|
|
|
|
{
|
2023-01-11 00:20:36 -08:00
|
|
|
int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
|
2022-06-28 17:49:38 -07:00
|
|
|
int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
|
2024-02-26 07:46:58 -08:00
|
|
|
int n = ra_add_node(g, compiler->fs_reg_set.classes[class_idx]);
|
2019-05-08 13:34:04 -05:00
|
|
|
assert(n == first_vgrf_node + vgrf);
|
|
|
|
|
assert(n == first_spill_node + spill_node_count);
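/* The spill temporary is only live immediately around the instruction at
 * ip, so it only needs to interfere with values live across that point.
 */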
|
|
|
|
|
|
|
|
|
|
setup_live_interference(n, ip - 1, ip + 1);
|
|
|
|
|
|
|
|
|
|
/* Add interference between this spill node and any other spill nodes for
|
|
|
|
|
* the same instruction.
|
|
|
|
|
*/
|
|
|
|
|
for (int s = 0; s < spill_node_count; s++) {
|
|
|
|
|
if (spill_vgrf_ip[s] == ip)
|
|
|
|
|
ra_add_node_interference(g, n, first_spill_node + s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Add this spill node to the list for next time */
|
|
|
|
|
if (spill_node_count >= spill_vgrf_ip_alloc) {
|
|
|
|
|
if (spill_vgrf_ip_alloc == 0)
|
|
|
|
|
spill_vgrf_ip_alloc = 16;
|
|
|
|
|
else
|
|
|
|
|
spill_vgrf_ip_alloc *= 2;
|
|
|
|
|
spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
|
|
|
|
|
spill_vgrf_ip_alloc);
|
|
|
|
|
}
|
|
|
|
|
spill_vgrf_ip[spill_node_count++] = ip;
|
|
|
|
|
|
2024-06-18 15:25:22 -07:00
|
|
|
return brw_vgrf(vgrf, BRW_TYPE_F);
|
2019-05-08 13:34:04 -05:00
|
|
|
}
|
|
|
|
|
|
2010-10-19 09:25:51 -07:00
|
|
|
void
|
2019-05-07 20:09:08 -05:00
|
|
|
fs_reg_alloc::spill_reg(unsigned spill_reg)
|
2010-10-19 09:25:51 -07:00
|
|
|
{
|
2019-05-07 20:09:08 -05:00
|
|
|
int size = fs->alloc.sizes[spill_reg];
|
|
|
|
|
unsigned int spill_offset = fs->last_scratch;
|
2010-10-19 09:25:51 -07:00
|
|
|
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
|
2013-10-29 12:46:18 -07:00
|
|
|
|
2023-09-30 03:14:45 -07:00
|
|
|
fs->spilled_any_registers = true;
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2024-09-03 00:25:10 +03:00
|
|
|
fs->last_scratch += align(size * REG_SIZE, REG_SIZE * reg_unit(devinfo));
|
2013-10-16 12:16:51 -07:00
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
/* We're about to replace all uses of this register. It no longer
|
|
|
|
|
* conflicts with anything so we can get rid of its interference.
|
|
|
|
|
*/
|
|
|
|
|
ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
|
|
|
|
|
ra_reset_node_interference(g, first_vgrf_node + spill_reg);
|
|
|
|
|
|
2010-10-19 09:25:51 -07:00
|
|
|
/* Generate spill/unspill instructions for the objects being
|
|
|
|
|
* spilled. Right now, we spill or unspill the whole thing to a
|
|
|
|
|
* virtual grf of the same size. For most instructions, though, we
|
|
|
|
|
* could just spill/unspill the GRF being accessed.
|
|
|
|
|
*/
|
2019-05-08 13:34:04 -05:00
|
|
|
int ip = 0;
|
2019-05-07 20:09:08 -05:00
|
|
|
foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
|
|
|
|
|
const fs_builder ibld = fs_builder(fs, block, inst);
|
2019-05-08 13:34:04 -05:00
|
|
|
exec_node *before = inst->prev;
|
|
|
|
|
exec_node *after = inst->next;
|
2016-05-15 20:30:06 -07:00
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->src[i].file == VGRF &&
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->src[i].nr == spill_reg) {
|
2024-11-13 11:08:26 +02:00
|
|
|
/* Count registers needed in units of physical registers */
|
2024-06-19 10:50:51 -07:00
|
|
|
int count = align(regs_read(devinfo, inst, i), reg_unit(devinfo));
|
2024-11-13 11:08:26 +02:00
|
|
|
/* Align the spilling offset to the physical register size */
|
2016-09-01 12:42:20 -07:00
|
|
|
int subset_spill_offset = spill_offset +
|
2024-09-03 00:25:10 +03:00
|
|
|
ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE * reg_unit(devinfo));
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg unspill_dst = alloc_spill_reg(count, ip);
|
2013-10-09 17:17:59 -07:00
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->src[i].nr = unspill_dst.nr;
|
2024-11-13 11:08:26 +02:00
|
|
|
/* The unspilled register is aligned to a physical register, so
|
|
|
|
|
* adjust the offset to the remainder within the physical register
|
|
|
|
|
* size.
|
|
|
|
|
*/
|
|
|
|
|
inst->src[i].offset %= REG_SIZE * reg_unit(devinfo);
|
2013-10-09 17:17:59 -07:00
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* We read the largest power-of-two divisor of the register count
|
|
|
|
|
* (because only POT scratch read blocks are allowed by the
|
|
|
|
|
* hardware) up to the maximum supported block size.
|
|
|
|
|
*/
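/* For example, a 3-register read (24 dwords) yields a width of 8, while a
 * 4-register read (32 dwords) uses the full width of 32.
 */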
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned width =
|
2016-09-07 16:59:35 -07:00
|
|
|
MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
|
2016-05-16 00:59:37 -07:00
|
|
|
|
|
|
|
|
/* Set exec_all() on unspill messages under the (rather
|
|
|
|
|
* pessimistic) assumption that there is no one-to-one
|
|
|
|
|
* correspondence between channels of the spilled variable in
|
|
|
|
|
* scratch space and the scratch read message, which operates on
|
|
|
|
|
* 32 bit channels. It shouldn't hurt in any case because the
|
|
|
|
|
* unspill destination is a block-local temporary.
|
|
|
|
|
*/
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
|
2022-07-18 12:27:53 +03:00
|
|
|
unspill_dst, subset_spill_offset, count, ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->dst.file == VGRF &&
|
2020-12-09 21:16:37 +02:00
|
|
|
inst->dst.nr == spill_reg &&
|
|
|
|
|
inst->opcode != SHADER_OPCODE_UNDEF) {
|
2024-11-13 11:08:26 +02:00
|
|
|
/* Count registers needed in units of physical registers */
|
|
|
|
|
int count = align(regs_written(inst), reg_unit(devinfo));
|
|
|
|
|
/* Align the spilling offset to the physical register size */
|
2016-09-01 12:42:20 -07:00
|
|
|
int subset_spill_offset = spill_offset +
|
2024-09-03 00:25:10 +03:00
|
|
|
ROUND_DOWN_TO(inst->dst.offset, reg_unit(devinfo) * REG_SIZE);
|
2024-11-13 11:08:26 +02:00
|
|
|
brw_reg spill_src = alloc_spill_reg(count, ip);
|
2013-12-08 04:57:08 +01:00
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->dst.nr = spill_src.nr;
|
2024-11-13 11:08:26 +02:00
|
|
|
/* The spilled register is aligned to a physical register, so adjust
|
|
|
|
|
* the offset to the remainder within the physical register size.
|
|
|
|
|
*/
|
|
|
|
|
inst->dst.offset %= REG_SIZE * reg_unit(devinfo);
|
2012-07-06 17:18:35 -07:00
|
|
|
|
2014-10-27 16:50:12 -07:00
|
|
|
/* If we're immediately spilling the register, we should not use
|
|
|
|
|
* destination dependency hints. Doing so will cause the GPU to
|
|
|
|
|
* try to read and write the register at the same time and may
|
|
|
|
|
* hang the GPU.
|
|
|
|
|
*/
|
|
|
|
|
inst->no_dd_clear = false;
|
|
|
|
|
inst->no_dd_check = false;
|
|
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* Calculate the execution width of the scratch messages (which work
|
|
|
|
|
* in terms of 32 bit components so we have a fixed number of eight
|
|
|
|
|
* channels per spilled register). We attempt to write one
|
|
|
|
|
* exec_size-wide component of the variable at a time without
|
|
|
|
|
* exceeding the maximum number of (fake) MRF registers reserved for
|
|
|
|
|
* spills.
|
|
|
|
|
*/
|
2022-10-10 18:05:13 -07:00
|
|
|
const unsigned width = 8 * reg_unit(devinfo) *
|
|
|
|
|
DIV_ROUND_UP(MIN2(inst->dst.component_size(inst->exec_size),
|
|
|
|
|
spill_max_size(fs) * REG_SIZE),
|
|
|
|
|
reg_unit(devinfo) * REG_SIZE);
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2016-05-16 01:23:44 -07:00
|
|
|
/* Spills should only write data initialized by the instruction for
|
2022-06-22 18:31:08 +02:00
|
|
|
* whichever channels are enabled in the execution mask. If that's
|
2016-05-16 01:23:44 -07:00
|
|
|
* not possible we'll have to emit a matching unspill before the
|
|
|
|
|
* instruction and set force_writemask_all on the spill.
|
|
|
|
|
*/
|
|
|
|
|
const bool per_channel =
|
2024-04-21 00:57:59 -07:00
|
|
|
inst->dst.is_contiguous() &&
|
|
|
|
|
brw_type_size_bytes(inst->dst.type) == 4 &&
|
2016-05-16 01:23:44 -07:00
|
|
|
inst->exec_size == width;
|
|
|
|
|
|
2016-05-15 22:59:04 -07:00
|
|
|
/* Builder used to emit the scratch messages. */
|
2016-05-16 01:23:44 -07:00
|
|
|
const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2012-07-06 17:18:35 -07:00
|
|
|
/* If our write is going to affect just part of the
|
2016-09-07 16:59:35 -07:00
|
|
|
* regs_written(inst), then we need to unspill the destination since
|
|
|
|
|
* we write back out all of the regs_written(). If the original
|
|
|
|
|
* instruction had force_writemask_all set and is not a partial
|
|
|
|
|
* write, there should be no need for the unspill since the
|
2016-05-16 01:23:44 -07:00
|
|
|
* instruction will be overwriting the whole destination in any case.
|
2010-10-19 09:25:51 -07:00
|
|
|
*/
|
2019-04-24 12:38:28 +02:00
|
|
|
if (inst->is_partial_write() ||
|
2016-05-16 01:23:44 -07:00
|
|
|
(!inst->force_writemask_all && !per_channel))
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_unspill(ubld, &fs->shader_stats, spill_src,
|
2022-07-18 12:27:53 +03:00
|
|
|
subset_spill_offset, regs_written(inst), ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
|
2022-07-18 12:27:53 +03:00
|
|
|
subset_spill_offset, regs_written(inst), ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
2011-01-12 10:10:01 -08:00
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
for (fs_inst *inst = (fs_inst *)before->next;
|
|
|
|
|
inst != after; inst = (fs_inst *)inst->next)
|
|
|
|
|
setup_inst_interference(inst);
|
|
|
|
|
|
|
|
|
|
/* We don't advance the ip for scratch read/write instructions
|
|
|
|
|
* because we consider them to have the same ip as the instruction we're
|
2020-10-09 04:27:35 -05:00
|
|
|
* spilling around for the purposes of interference. Also, we're
|
|
|
|
|
* inserting spill instructions without re-running liveness analysis
|
|
|
|
|
* and we don't want to mess up our IPs.
|
2019-05-08 13:34:04 -05:00
|
|
|
*/
|
2020-10-09 04:27:35 -05:00
|
|
|
if (!_mesa_set_search(spill_insts, inst))
|
2019-05-08 13:34:04 -05:00
|
|
|
ip++;
|
|
|
|
|
}
|
2020-10-12 15:07:25 -05:00
|
|
|
|
|
|
|
|
assert(ip == live_instr_count);
|
2010-10-20 10:26:29 -07:00
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
|
|
|
|
|
bool
|
2019-05-07 20:09:08 -05:00
|
|
|
fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
|
2019-05-07 16:48:27 -05:00
|
|
|
{
|
intel/brw: Only force g0's liveness to be the whole program if spilling
We don't actually need to extend g0's live range to the EOT message
generally - most messages that end a shader are headerless. The main
implicit use of g0 is for constructing scratch headers. With the last
two patches, we now consider scratch access that may exist in the IR
and already extend the liveness appropriately.
There is one remaining problem: spilling. The register allocator will
create new scratch messages when spilling a register, which need to
create scratch headers, which need g0. So, every new spill or fill
might extend the live range of g0, which would create new interference,
altering the graph. This can be problematic.
However, when compiling SIMD16 or SIMD32 fragment shaders, we don't
allow spilling anyway. So, why not allow the use of g0? Also, when trying
various scheduling modes, we first try allocation without spilling.
If it works, great, if not, we try a (hopefully) less aggressive
schedule, and only allow spilling on the lowest-pressure schedule.
So, even for regular SIMD8 shaders, we can potentially gain the use
of g0 on the first few tries at scheduling+allocation.
Once we try to allocate with spilling, we go back to reserving g0
for the entire program, so that we can construct scratch headers at
any point. We could possibly do better here, but this is simple and
reliable with some benefit.
Thanks to Ian Romanick for suggesting I try this approach.
fossil-db on Alchemist shows some more spill/fill improvements:
Totals:
Instrs: 149062395 -> 149053010 (-0.01%); split: -0.01%, +0.00%
Cycles: 12609496913 -> 12611652181 (+0.02%); split: -0.45%, +0.47%
Spill count: 52891 -> 52471 (-0.79%)
Fill count: 101599 -> 100818 (-0.77%)
Scratch Memory Size: 3292160 -> 3197952 (-2.86%)
Totals from 416541 (66.59% of 625484) affected shaders:
Instrs: 124058587 -> 124049202 (-0.01%); split: -0.01%, +0.01%
Cycles: 3567164271 -> 3569319539 (+0.06%); split: -1.61%, +1.67%
Spill count: 420 -> 0 (-inf%)
Fill count: 781 -> 0 (-inf%)
Scratch Memory Size: 94208 -> 0 (-inf%)
Witcher 3 shows a 33% reduction in scratch memory size, for example.
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30319>
2024-07-22 17:22:47 -07:00
|
|
|
build_interference_graph(allow_spilling);
|
2019-05-07 18:14:46 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
unsigned spilled = 0;
|
2019-05-08 13:34:04 -05:00
|
|
|
while (1) {
|
2019-05-07 18:14:46 -05:00
|
|
|
/* Debug of register spilling: Go spill everything. */
|
|
|
|
|
if (unlikely(spill_all)) {
|
|
|
|
|
int reg = choose_spill_reg();
|
|
|
|
|
if (reg != -1) {
|
|
|
|
|
spill_reg(reg);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2019-05-07 18:14:46 -05:00
|
|
|
if (ra_allocate(g))
|
|
|
|
|
break;
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2019-05-07 18:14:46 -05:00
|
|
|
if (!allow_spilling)
|
2019-05-07 16:48:27 -05:00
|
|
|
return false;
|
|
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
/* Failed to allocate registers. Spill some regs, and the caller will
|
2019-05-14 22:51:20 -05:00
|
|
|
* loop back into here to try again.
|
|
|
|
|
*/
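/* The number of registers spilled per iteration ramps up with how many
 * have been spilled so far; with a spilling_rate of 4, for instance, the
 * first passes spill one register each, but after 8 registers have been
 * spilled we spill two per pass, and so on.
 */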
|
2020-10-23 15:58:06 -05:00
|
|
|
unsigned nr_spills = 1;
|
|
|
|
|
if (compiler->spilling_rate)
|
|
|
|
|
nr_spills = MAX2(1, spilled / compiler->spilling_rate);
|
2019-05-14 22:51:20 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
for (unsigned j = 0; j < nr_spills; j++) {
|
|
|
|
|
int reg = choose_spill_reg();
|
|
|
|
|
if (reg == -1) {
|
|
|
|
|
if (j == 0)
|
|
|
|
|
return false; /* Nothing to spill */
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-05-08 13:34:04 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
spill_reg(reg);
|
|
|
|
|
spilled++;
|
|
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
if (spilled)
|
2016-03-13 19:26:37 -07:00
|
|
|
fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
2019-05-08 13:34:04 -05:00
|
|
|
|
2019-05-07 16:48:27 -05:00
|
|
|
/* Get the chosen virtual registers for each node, and map virtual
|
|
|
|
|
* regs in the register classes back down to real hardware reg
|
|
|
|
|
* numbers.
|
|
|
|
|
*/
|
2025-01-06 20:51:32 -08:00
|
|
|
unsigned *hw_reg_mapping = ralloc_array(NULL, unsigned, fs->alloc.count);
|
2019-05-07 20:09:08 -05:00
|
|
|
fs->grf_used = fs->first_non_payload_grf;
|
|
|
|
|
for (unsigned i = 0; i < fs->alloc.count; i++) {
|
2019-05-08 13:09:27 -05:00
|
|
|
int reg = ra_get_node_reg(g, first_vgrf_node + i);
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2021-03-05 09:20:01 -08:00
|
|
|
hw_reg_mapping[i] = reg;
|
2019-05-07 20:09:08 -05:00
|
|
|
fs->grf_used = MAX2(fs->grf_used,
|
2022-06-28 17:49:38 -07:00
|
|
|
hw_reg_mapping[i] + DIV_ROUND_UP(fs->alloc.sizes[i],
|
|
|
|
|
reg_unit(devinfo)));
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
|
2019-05-07 20:09:08 -05:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
|
2022-06-28 17:49:38 -07:00
|
|
|
assign_reg(devinfo, hw_reg_mapping, &inst->dst);
|
2019-05-07 16:48:27 -05:00
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2022-06-28 17:49:38 -07:00
|
|
|
assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-07 20:09:08 -05:00
|
|
|
fs->alloc.count = fs->grf_used;
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2025-01-06 20:51:32 -08:00
|
|
|
ralloc_free(hw_reg_mapping);
|
|
|
|
|
|
2019-05-07 16:48:27 -05:00
|
|
|
return true;
|
|
|
|
|
}
|
2019-05-07 20:09:08 -05:00
|
|
|
|
|
|
|
|
bool
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all)
|
2019-05-07 20:09:08 -05:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
fs_reg_alloc alloc(&s);
|
2019-05-07 18:14:46 -05:00
|
|
|
bool success = alloc.assign_regs(allow_spilling, spill_all);
|
|
|
|
|
if (!success && allow_spilling) {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("no register to spill:\n");
|
2024-12-07 09:53:31 -08:00
|
|
|
brw_print_instructions(s);
|
2019-05-07 18:14:46 -05:00
|
|
|
}
|
|
|
|
|
return success;
|
2019-05-07 20:09:08 -05:00
|
|
|
}
|