/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
#include "util/set.h"
#include "util/register_allocate.h"

using namespace brw;

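/* Number of contiguous-register classes we set up: one for each virtual
 * register size from 1 to REG_CLASS_COUNT physical registers.  This must
 * match MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo), which is asserted in
 * brw_fs_alloc_reg_sets() below.
 */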
#define REG_CLASS_COUNT 20

static void
assign_reg(const struct intel_device_info *devinfo,
           unsigned *reg_hw_locations, fs_reg *reg)
{
   if (reg->file == VGRF) {
      reg->nr = reg_unit(devinfo) * reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
      reg->offset %= REG_SIZE;
   }
}

void
fs_visitor::assign_regs_trivial()
{
   unsigned hw_reg_mapping[this->alloc.count + 1];
   unsigned i;
   int reg_width = dispatch_width / 8;

   /* Note that compressed instructions require alignment to 2 registers. */
   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
   for (i = 1; i <= this->alloc.count; i++) {
      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
                           DIV_ROUND_UP(this->alloc.sizes[i - 1],
                                        reg_unit(devinfo)));
   }
   this->grf_used = hw_reg_mapping[this->alloc.count];

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assign_reg(devinfo, hw_reg_mapping, &inst->dst);
      for (i = 0; i < inst->sources; i++) {
         assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
      }
   }

   if (this->grf_used >= BRW_MAX_GRF) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
           this->grf_used, BRW_MAX_GRF);
   } else {
      this->alloc.count = this->grf_used;
   }
}

extern "C" void
brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
{
   const struct intel_device_info *devinfo = compiler->devinfo;
   int base_reg_count = BRW_MAX_GRF;

   /* The registers used to make up almost all values handled in the compiler
    * are a scalar value occupying a single register (or 2 registers in the
    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
    * multiplying allocated register numbers by 2).  Things that were
    * aggregates of scalar values at the GLSL level were split to scalar
    * values by split_virtual_grfs().
    *
    * However, texture SEND messages return a series of contiguous registers
    * to write into.  We currently always ask for 4 registers, but we may
    * convert that to use less some day.
    *
    * Additionally, on gfx5 we need aligned pairs of registers for the PLN
    * instruction, and on gfx4 we need 8 contiguous regs for workaround simd16
    * texturing.
    */
   assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo));
   int class_sizes[REG_CLASS_COUNT];
   for (unsigned i = 0; i < REG_CLASS_COUNT; i++)
      class_sizes[i] = i + 1;

   struct ra_regs *regs = ra_alloc_reg_set(compiler, BRW_MAX_GRF, false);
   ra_set_allocate_round_robin(regs);
   struct ra_class **classes = ralloc_array(compiler, struct ra_class *,
                                            REG_CLASS_COUNT);

   /* Now, make the register classes for each size of contiguous register
    * allocation we might need to make.
    */
   for (int i = 0; i < REG_CLASS_COUNT; i++) {
      classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]);

      for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++)
         ra_class_add_reg(classes[i], reg);
   }

   ra_set_finalize(regs, NULL);

   compiler->fs_reg_set.regs = regs;
   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_set.classes); i++)
      compiler->fs_reg_set.classes[i] = NULL;
   for (int i = 0; i < REG_CLASS_COUNT; i++)
      compiler->fs_reg_set.classes[class_sizes[i] - 1] = classes[i];
}

static int
count_to_loop_end(const bblock_t *block)
{
   if (block->end()->opcode == BRW_OPCODE_WHILE)
      return block->end_ip;

   int depth = 1;
   /* Skip the first block, since we don't want to count the DO that the
    * calling function found.
    */
   for (block = block->next();
        depth > 0;
        block = block->next()) {
      if (block->start()->opcode == BRW_OPCODE_DO)
         depth++;
      if (block->end()->opcode == BRW_OPCODE_WHILE) {
         depth--;
         if (depth == 0)
            return block->end_ip;
      }
   }
   unreachable("not reached");
}

void fs_visitor::calculate_payload_ranges(unsigned payload_node_count,
                                          int *payload_last_use_ip) const
{
   int loop_depth = 0;
   int loop_end_ip = 0;

   for (unsigned i = 0; i < payload_node_count; i++)
      payload_last_use_ip[i] = -1;

   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;

         /* Since payload regs are deffed only at the start of the shader
          * execution, any uses of the payload within a loop mean the live
          * interval extends to the end of the outermost loop.  Find the ip of
          * the end now.
          */
         if (loop_depth == 1)
            loop_end_ip = count_to_loop_end(block);
         break;

      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;

      default:
         break;
      }

      int use_ip;
      if (loop_depth > 0)
         use_ip = loop_end_ip;
      else
         use_ip = ip;

      /* Note that UNIFORM args have been turned into FIXED_GRF by
       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
       * the start (see interp_reg()).
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF) {
            unsigned reg_nr = inst->src[i].nr;
            if (reg_nr / reg_unit(devinfo) >= payload_node_count)
               continue;

            for (unsigned j = reg_nr / reg_unit(devinfo);
                 j < DIV_ROUND_UP(reg_nr + regs_read(inst, i),
                                  reg_unit(devinfo));
                 j++) {
               payload_last_use_ip[j] = use_ip;
               assert(j < payload_node_count);
            }
         }
      }

      if (inst->dst.file == FIXED_GRF) {
         unsigned reg_nr = inst->dst.nr;
         if (reg_nr / reg_unit(devinfo) < payload_node_count) {
            for (unsigned j = reg_nr / reg_unit(devinfo);
                 j < DIV_ROUND_UP(reg_nr + regs_written(inst),
                                  reg_unit(devinfo));
                 j++) {
               payload_last_use_ip[j] = use_ip;
               assert(j < payload_node_count);
            }
         }
      }

      if (inst->eot) {
         /* We could omit this for the !inst->header_present case, except
          * that the simulator apparently incorrectly reads from g0/g1
          * instead of sideband.  It also really freaks out driver
          * developers to see g0 used in unusual places, so just always
          * reserve it.
          */
         payload_last_use_ip[0] = use_ip;
         payload_last_use_ip[1] = use_ip;
      }

      ip++;
   }
}

class fs_reg_alloc {
public:
   fs_reg_alloc(fs_visitor *fs):
      fs(fs), devinfo(fs->devinfo), compiler(fs->compiler),
      live(fs->live_analysis.require()), g(NULL),
      have_spill_costs(false)
   {
      mem_ctx = ralloc_context(NULL);

      /* Stash the number of instructions so we can sanity check that our
       * counts still match liveness.
       */
      live_instr_count = fs->cfg->last_block()->end_ip + 1;

      spill_insts = _mesa_pointer_set_create(mem_ctx);

      /* Most of this allocation was written for a reg_width of 1
       * (dispatch_width == 8).  In extending to SIMD16, the code was
       * left in place and it was converted to have the hardware
       * registers it's allocating be contiguous physical pairs of regs
       * for reg_width == 2.
       */
      int reg_width = fs->dispatch_width / 8;
      payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);

      /* Get payload IP information */
      payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);

      node_count = 0;
      first_payload_node = 0;
      scratch_header_node = 0;
      grf127_send_hack_node = 0;
      first_vgrf_node = 0;
      last_vgrf_node = 0;
      first_spill_node = 0;

      spill_vgrf_ip = NULL;
      spill_vgrf_ip_alloc = 0;
      spill_node_count = 0;
   }

   ~fs_reg_alloc()
   {
      ralloc_free(mem_ctx);
   }

   bool assign_regs(bool allow_spilling, bool spill_all);

private:
   void setup_live_interference(unsigned node,
                                int node_start_ip, int node_end_ip);
   void setup_inst_interference(const fs_inst *inst);

   void build_interference_graph(bool allow_spilling);
   void discard_interference_graph();

   fs_reg build_lane_offsets(const fs_builder &bld,
                             uint32_t spill_offset, int ip);
   fs_reg build_single_offset(const fs_builder &bld,
                              uint32_t spill_offset, int ip);

   void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
                     fs_reg dst, uint32_t spill_offset, unsigned count, int ip);
   void emit_spill(const fs_builder &bld, struct shader_stats *stats,
                   fs_reg src, uint32_t spill_offset, unsigned count, int ip);

   void set_spill_costs();
   int choose_spill_reg();
   fs_reg alloc_scratch_header();
   fs_reg alloc_spill_reg(unsigned size, int ip);
   void spill_reg(unsigned spill_reg);

   void *mem_ctx;
   fs_visitor *fs;
   const intel_device_info *devinfo;
   const brw_compiler *compiler;
   const fs_live_variables &live;
   int live_instr_count;

   set *spill_insts;

   ra_graph *g;
   bool have_spill_costs;

   int payload_node_count;
   int *payload_last_use_ip;
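
   /* Interference-graph node layout: payload nodes come first, then the
    * grf127 send-hack node, then one node per VGRF, then (on platforms that
    * use a scratch header) the scratch header node, and finally any spill
    * temporaries added on the fly.  See build_interference_graph().
    */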
   int node_count;
   int first_payload_node;
   int scratch_header_node;
   int grf127_send_hack_node;
   int first_vgrf_node;
   int last_vgrf_node;
   int first_spill_node;

   int *spill_vgrf_ip;
   int spill_vgrf_ip_alloc;
   int spill_node_count;

   fs_reg scratch_header;
};

namespace {
   /**
    * Maximum spill block size we expect to encounter in 32B units.
    *
    * This is somewhat arbitrary and doesn't necessarily limit the maximum
    * variable size that can be spilled -- A higher value will allow a
    * variable of a given size to be spilled more efficiently with a smaller
    * number of scratch messages, but will increase the likelihood of a
    * collision between the MRFs reserved for spilling and other MRFs used by
    * the program (and possibly increase GRF register pressure on platforms
    * without hardware MRFs), which could cause register allocation to fail.
    *
    * For the moment reserve just enough space so a register of 32 bit
    * component type and natural region width can be spilled without splitting
    * into multiple (force_writemask_all) scratch messages.
    */
   unsigned
   spill_max_size(const fs_visitor *s)
   {
      /* LSC is limited to SIMD16 sends */
      if (s->devinfo->has_lsc)
         return 2;

      /* FINISHME - On Gfx7+ it should be possible to avoid this limit
       *            altogether by spilling directly from the temporary GRF
       *            allocated to hold the result of the instruction (and the
       *            scratch write header).
       */
      /* FINISHME - The shader's dispatch width probably belongs in
       *            backend_shader (or some nonexistent fs_shader class?)
       *            rather than in the visitor class.
       */
      return s->dispatch_width / 8;
   }
}

void
fs_reg_alloc::setup_live_interference(unsigned node,
                                      int node_start_ip, int node_end_ip)
{
   /* Mark any virtual grf that is live between the start of the program and
    * the last use of a payload node interfering with that payload node.
    */
   for (int i = 0; i < payload_node_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      /* Note that we use a <= comparison, unlike vgrfs_interfere(),
       * in order to not have to worry about the uniform issue described in
       * calculate_live_intervals().
       */
      if (node_start_ip <= payload_last_use_ip[i])
         ra_add_node_interference(g, node, first_payload_node + i);
   }

   /* Everything interferes with the scratch header */
   if (scratch_header_node >= 0)
      ra_add_node_interference(g, node, scratch_header_node);

   /* Add interference with every vgrf whose live range intersects this
    * node's.  We only need to look at nodes below this one as the reflexivity
    * of interference will take care of the rest.
    */
   for (unsigned n2 = first_vgrf_node;
        n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) {
      unsigned vgrf = n2 - first_vgrf_node;
      if (!(node_end_ip <= live.vgrf_start[vgrf] ||
            live.vgrf_end[vgrf] <= node_start_ip))
         ra_add_node_interference(g, node, n2);
   }
}

void
fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
{
   /* Certain instructions can't safely use the same register for their
    * sources and destination.  Add interference.
    */
   if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                     first_vgrf_node + inst->src[i].nr);
         }
      }
   }

   /* A compressed instruction is actually two instructions executed
    * simultaneously.  On most platforms, it's ok to have the source and
    * destination registers be the same.  In this case, each instruction
    * over-writes its own source and there's no problem.  The real problem
    * here is if the source and destination registers are off by one. Then
    * you can end up in a scenario where the first instruction over-writes the
    * source of the second instruction. Since the compiler doesn't know about
    * this level of granularity, we simply make the source and destination
    * interfere.
    */
   if (inst->dst.component_size(inst->exec_size) > REG_SIZE &&
       inst->dst.file == VGRF) {
      for (int i = 0; i < inst->sources; ++i) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                     first_vgrf_node + inst->src[i].nr);
         }
      }
   }

   if (grf127_send_hack_node >= 0) {
      /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
       * subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * We are avoiding using grf127 as part of the destination of send
       * messages adding a node interference to the grf127_send_hack_node.
       * This node has a fixed assignment to grf127.
       *
       * We don't apply it to SIMD16 instructions because previous code avoids
       * any register overlap between sources and destination.
       */
      if (inst->exec_size < 16 && inst->is_send_from_grf() &&
          inst->dst.file == VGRF)
         ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
                                  grf127_send_hack_node);
   }

   /* From the Skylake PRM Vol. 2a docs for sends:
    *
    *    "It is required that the second block of GRFs does not overlap with
    *    the first block."
    *
    * Normally, this is taken care of by fixup_sends_duplicate_payload() but
    * in the case where one of the registers is an undefined value, the
    * register allocator may decide that they don't interfere even though
    * they're used as sources in the same instruction.  We also need to add
    * interference here.
    */
   if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
       inst->src[2].file == VGRF && inst->src[3].file == VGRF &&
       inst->src[2].nr != inst->src[3].nr)
      ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr,
                               first_vgrf_node + inst->src[3].nr);

   /* When we do send-from-GRF for FB writes, we need to ensure that the last
    * write instruction sends from a high register.  This is because the
    * vertex fetcher wants to start filling the low payload registers while
    * the pixel data port is still working on writing out the memory.  If we
    * don't do this, we get rendering artifacts.
    *
    * We could just do "something high".  Instead, we just pick the highest
    * register that works.
    */
   if (inst->eot) {
      const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
                       inst->src[2].nr : inst->src[0].nr;
      const int size = DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
      int reg = BRW_MAX_GRF - size;

      if (grf127_send_hack_node >= 0) {
         /* Avoid r127 which might be unusable if the node was previously
          * written by a SIMD8 SEND message with source/destination overlap.
          */
         reg--;
      }

      assert(reg >= 112);
      ra_set_node_reg(g, first_vgrf_node + vgrf, reg);

      if (inst->ex_mlen > 0) {
         const int vgrf = inst->src[3].nr;
         reg -= DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
         assert(reg >= 112);
         ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
      }
   }
}

void
fs_reg_alloc::build_interference_graph(bool allow_spilling)
{
   /* Compute the RA node layout */
   node_count = 0;
   first_payload_node = node_count;
   node_count += payload_node_count;

   grf127_send_hack_node = node_count;
   node_count++;

   first_vgrf_node = node_count;
   node_count += fs->alloc.count;
   last_vgrf_node = node_count - 1;
   if (devinfo->verx10 < 125 && allow_spilling) {
      scratch_header_node = node_count++;
   } else {
      scratch_header_node = -1;
   }
   first_spill_node = node_count;

   fs->calculate_payload_ranges(payload_node_count,
                                payload_last_use_ip);

   assert(g == NULL);
   g = ra_alloc_interference_graph(compiler->fs_reg_set.regs, node_count);
   ralloc_steal(mem_ctx, g);

   /* Set up the payload nodes */
   for (int i = 0; i < payload_node_count; i++)
      ra_set_node_reg(g, first_payload_node + i, i);

   if (grf127_send_hack_node >= 0)
      ra_set_node_reg(g, grf127_send_hack_node, 127);

   /* Specify the classes of each virtual register. */
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      unsigned size = DIV_ROUND_UP(fs->alloc.sizes[i], reg_unit(devinfo));

      assert(size <= ARRAY_SIZE(compiler->fs_reg_set.classes) &&
             "Register allocation relies on split_virtual_grfs()");

      ra_set_node_class(g, first_vgrf_node + i,
                        compiler->fs_reg_set.classes[size - 1]);
   }

   /* Add interference based on the live range of the register */
   for (unsigned i = 0; i < fs->alloc.count; i++) {
      setup_live_interference(first_vgrf_node + i,
                              live.vgrf_start[i],
                              live.vgrf_end[i]);
   }

   /* Add interference based on the instructions in which a register is used.
    */
   foreach_block_and_inst(block, fs_inst, inst, fs->cfg)
      setup_inst_interference(inst);
}

void
fs_reg_alloc::discard_interference_graph()
{
   ralloc_free(g);
   g = NULL;
   have_spill_costs = false;
}

fs_reg
fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip)
{
   fs_reg offset = retype(alloc_spill_reg(1, ip), BRW_TYPE_UD);
   fs_inst *inst = bld.MOV(offset, brw_imm_ud(spill_offset));
   _mesa_set_add(spill_insts, inst);
   return offset;
}

fs_reg
fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
{
   /* LSC messages are limited to SIMD16 */
   assert(bld.dispatch_width() <= 16);

   const fs_builder ubld = bld.exec_all();
   const unsigned reg_count = ubld.dispatch_width() / 8;

   fs_reg offset = retype(alloc_spill_reg(reg_count, ip), BRW_TYPE_UD);
   fs_inst *inst;

   /* Build an offset per lane in SIMD8 */
   inst = ubld.group(8, 0).MOV(retype(offset, BRW_TYPE_UW),
                               brw_imm_uv(0x76543210));
   _mesa_set_add(spill_insts, inst);
   inst = ubld.group(8, 0).MOV(offset, retype(offset, BRW_TYPE_UW));
   _mesa_set_add(spill_insts, inst);

   /* Build offsets in the upper 8 lanes of SIMD16 */
   if (ubld.dispatch_width() > 8) {
      inst = ubld.group(8, 0).ADD(
         byte_offset(offset, REG_SIZE),
         byte_offset(offset, 0),
         brw_imm_ud(8));
      _mesa_set_add(spill_insts, inst);
   }

   /* Make the offset a dword */
   inst = ubld.SHL(offset, offset, brw_imm_ud(2));
   _mesa_set_add(spill_insts, inst);

   /* Add the base offset */
   inst = ubld.ADD(offset, offset, brw_imm_ud(spill_offset));
   _mesa_set_add(spill_insts, inst);

   return offset;
}

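/* Read ("fill") count physical registers back from scratch memory at
 * spill_offset into dst.  Every generated instruction is added to
 * spill_insts so it is never itself considered for spilling.
 */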
void
fs_reg_alloc::emit_unspill(const fs_builder &bld,
                           struct shader_stats *stats,
                           fs_reg dst,
                           uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->fill_count;

      fs_inst *unspill_inst;
      if (devinfo->verx10 >= 125) {
         /* LSC is limited to SIMD16 load/store but we can load more using
          * transpose messages.
          */
         const bool use_transpose = bld.dispatch_width() > 16;
         const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
         fs_reg offset;
         if (use_transpose) {
            offset = build_single_offset(ubld, spill_offset, ip);
         } else {
            offset = build_lane_offsets(ubld, spill_offset, ip);
         }
         /* We leave the extended descriptor empty and flag the instruction to
          * ask the generator to insert the extended descriptor in the address
          * register.  That way we don't need to burn an additional register
          * for register allocation spill/fill.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            offset,        /* payload */
            fs_reg(),      /* payload2 */
         };

         unspill_inst = ubld.emit(SHADER_OPCODE_SEND, dst,
                                  srcs, ARRAY_SIZE(srcs));
         unspill_inst->sfid = GFX12_SFID_UGM;
         unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                           LSC_ADDR_SURFTYPE_SS,
                                           LSC_ADDR_SIZE_A32,
                                           LSC_DATA_SIZE_D32,
                                           use_transpose ? reg_size * 8 : 1 /* num_channels */,
                                           use_transpose,
                                           LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
         unspill_inst->header_size = 0;
         unspill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
                                               unspill_inst->exec_size);
         unspill_inst->ex_mlen = 0;
         unspill_inst->size_written =
            lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, bld.dispatch_width()) * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->send_ex_desc_scratch = true;
      } else {
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         unspill_inst = ubld.MOV(component(header, 2),
                                 brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, unspill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header };
         unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
                                 srcs, ARRAY_SIZE(srcs));
         unspill_inst->mlen = 1;
         unspill_inst->header_size = 1;
         unspill_inst->size_written = reg_size * REG_SIZE;
         unspill_inst->send_has_side_effects = false;
         unspill_inst->send_is_volatile = true;
         unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         unspill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      }
      _mesa_set_add(spill_insts, unspill_inst);

      dst.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

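/* Write ("spill") count physical registers of src out to scratch memory,
 * starting at spill_offset.  The counterpart of emit_unspill() above.
 */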
void
fs_reg_alloc::emit_spill(const fs_builder &bld,
                         struct shader_stats *stats,
                         fs_reg src,
                         uint32_t spill_offset, unsigned count, int ip)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = src.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      ++stats->spill_count;

      fs_inst *spill_inst;
      if (devinfo->verx10 >= 125) {
         fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
         /* We leave the extended descriptor empty and flag the instruction to
          * relocate the extended descriptor.  That way the surface offset is
          * directly put into the instruction and we don't need to use a
          * register to hold it.
          */
         fs_reg srcs[] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            offset,        /* payload */
            src,           /* payload2 */
         };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->sfid = GFX12_SFID_UGM;
         spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
                                         LSC_ADDR_SURFTYPE_SS,
                                         LSC_ADDR_SIZE_A32,
                                         LSC_DATA_SIZE_D32,
                                         1 /* num_channels */,
                                         false /* transpose */,
                                         LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
         spill_inst->header_size = 0;
         spill_inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32,
                                             bld.dispatch_width());
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->send_ex_desc_scratch = true;
      } else {
         fs_reg header = this->scratch_header;
         fs_builder ubld = bld.exec_all().group(1, 0);
         assert(spill_offset % 16 == 0);
         spill_inst = ubld.MOV(component(header, 2),
                               brw_imm_ud(spill_offset / 16));
         _mesa_set_add(spill_insts, spill_inst);

         const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
         const fs_reg ex_desc = brw_imm_ud(0);

         fs_reg srcs[] = { brw_imm_ud(0), ex_desc, header, src };
         spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
                               srcs, ARRAY_SIZE(srcs));
         spill_inst->mlen = 1;
         spill_inst->ex_mlen = reg_size;
         spill_inst->size_written = 0;
         spill_inst->header_size = 1;
         spill_inst->send_has_side_effects = true;
         spill_inst->send_is_volatile = false;
         spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         spill_inst->desc =
            brw_dp_desc(devinfo, bti,
                        GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
                        BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
      }
      _mesa_set_add(spill_insts, spill_inst);

      src.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

void
fs_reg_alloc::set_spill_costs()
{
   float block_scale = 1.0;
   float spill_costs[fs->alloc.count];
   bool no_spill[fs->alloc.count];

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      spill_costs[i] = 0.0;
      no_spill[i] = false;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
      }

      if (inst->dst.file == VGRF)
         spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;

      /* Don't spill anything we generated while spilling */
      if (_mesa_set_search(spill_insts, inst)) {
         for (unsigned int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == VGRF)
               no_spill[inst->src[i].nr] = true;
         }
         if (inst->dst.file == VGRF)
            no_spill[inst->dst.nr] = true;
      }

      switch (inst->opcode) {

      case BRW_OPCODE_DO:
         block_scale *= 10;
         break;

      case BRW_OPCODE_WHILE:
         block_scale /= 10;
         break;

      case BRW_OPCODE_IF:
         block_scale *= 0.5;
         break;

      case BRW_OPCODE_ENDIF:
         block_scale /= 0.5;
         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < fs->alloc.count; i++) {
      /* Do the no_spill check first.  Registers that are used as spill
       * temporaries may have been allocated after we calculated liveness so
       * we shouldn't look their liveness up.  Fortunately, they're always
       * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
       * no_spill.
       */
      if (no_spill[i])
         continue;

      int live_length = live.vgrf_end[i] - live.vgrf_start[i];
      if (live_length <= 0)
         continue;

      /* Divide the cost (in number of spills/fills) by the log of the length
       * of the live range of the register.  This will encourage spill logic
       * to spill long-living things before spilling short-lived things where
       * spilling is less likely to actually do us any good.  We use the log
       * of the length because it will fall off very quickly and not cause us
       * to spill medium length registers with more uses.
       */
      float adjusted_cost = spill_costs[i] / logf(live_length);
      ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
   }

   have_spill_costs = true;
}

int
fs_reg_alloc::choose_spill_reg()
{
   if (!have_spill_costs)
      set_spill_costs();

   int node = ra_get_best_spill_node(g);
   if (node < 0)
      return -1;

   assert(node >= first_vgrf_node);
   return node - first_vgrf_node;
}

fs_reg
fs_reg_alloc::alloc_scratch_header()
{
   int vgrf = fs->alloc.allocate(1);
   assert(first_vgrf_node + vgrf == scratch_header_node);
   ra_set_node_class(g, scratch_header_node,
                     compiler->fs_reg_set.classes[0]);

   setup_live_interference(scratch_header_node, 0, INT_MAX);

   return fs_reg(VGRF, vgrf, BRW_TYPE_UD);
}

fs_reg
fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
{
   int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
   int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
   int n = ra_add_node(g, compiler->fs_reg_set.classes[class_idx]);
   assert(n == first_vgrf_node + vgrf);
   assert(n == first_spill_node + spill_node_count);

   setup_live_interference(n, ip - 1, ip + 1);

   /* Add interference between this spill node and any other spill nodes for
    * the same instruction.
    */
   for (int s = 0; s < spill_node_count; s++) {
      if (spill_vgrf_ip[s] == ip)
         ra_add_node_interference(g, n, first_spill_node + s);
   }

   /* Add this spill node to the list for next time */
   if (spill_node_count >= spill_vgrf_ip_alloc) {
      if (spill_vgrf_ip_alloc == 0)
         spill_vgrf_ip_alloc = 16;
      else
         spill_vgrf_ip_alloc *= 2;
      spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
                               spill_vgrf_ip_alloc);
   }
   spill_vgrf_ip[spill_node_count++] = ip;

   return fs_reg(VGRF, vgrf);
}

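/* Rewrite every use of spill_reg to go through scratch memory: reads of the
 * register become fills into fresh temporaries and writes become spills from
 * them, with the temporaries allocated on the fly as new RA nodes.
 */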
void
fs_reg_alloc::spill_reg(unsigned spill_reg)
{
   int size = fs->alloc.sizes[spill_reg];
   unsigned int spill_offset = fs->last_scratch;
   assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */

   /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
    * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
    * up to m13 (gfx6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
    * m15 (gfx4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
    * depth), starting from m1.  In summary: We may not be able to spill in
    * SIMD16 mode, because we'd stomp the FB writes.
    */
   if (!fs->spilled_any_registers) {
      if (devinfo->verx10 >= 125) {
         /* We will allocate a register on the fly */
      } else {
         this->scratch_header = alloc_scratch_header();
         fs_builder ubld = fs_builder(fs, 8).exec_all().at(
            fs->cfg->first_block(), fs->cfg->first_block()->start());

         fs_inst *inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
                                   this->scratch_header);
         _mesa_set_add(spill_insts, inst);
      }

      fs->spilled_any_registers = true;
   }

   fs->last_scratch += size * REG_SIZE;

   /* We're about to replace all uses of this register.  It no longer
    * conflicts with anything so we can get rid of its interference.
    */
   ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
   ra_reset_node_interference(g, first_vgrf_node + spill_reg);

   /* Generate spill/unspill instructions for the objects being
    * spilled.  Right now, we spill or unspill the whole thing to a
    * virtual grf of the same size.  For most instructions, though, we
    * could just spill/unspill the GRF being accessed.
    */
   int ip = 0;
   foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
      const fs_builder ibld = fs_builder(fs, block, inst);
      exec_node *before = inst->prev;
      exec_node *after = inst->next;

      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF &&
             inst->src[i].nr == spill_reg) {
            int count = regs_read(inst, i);
            int subset_spill_offset = spill_offset +
               ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
            fs_reg unspill_dst = alloc_spill_reg(count, ip);

            inst->src[i].nr = unspill_dst.nr;
            inst->src[i].offset %= REG_SIZE;
2013-10-09 17:17:59 -07:00
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* We read the largest power-of-two divisor of the register count
|
|
|
|
|
* (because only POT scratch read blocks are allowed by the
|
|
|
|
|
* hardware) up to the maximum supported block size.
|
|
|
|
|
*/
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned width =
|
2016-09-07 16:59:35 -07:00
|
|
|
MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
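/* Worked example for the width computation above (illustrative values):
 * for count == 3 registers there are 3 * 8 == 24 channels; ffs(24) == 4,
 * so 1u << 3 == 8 and the read width is 8 channels, the largest power of
 * two dividing 24.  For count == 4 the full 32-channel block is used.
 */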
|
2016-05-16 00:59:37 -07:00
|
|
|
|
|
|
|
|
/* Set exec_all() on unspill messages under the (rather
|
|
|
|
|
* pessimistic) assumption that there is no one-to-one
|
|
|
|
|
* correspondence between channels of the spilled variable in
|
|
|
|
|
* scratch space and the scratch read message, which operates on
|
|
|
|
|
* 32 bit channels. It shouldn't hurt in any case because the
|
|
|
|
|
* unspill destination is a block-local temporary.
|
|
|
|
|
*/
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
|
2022-07-18 12:27:53 +03:00
|
|
|
unspill_dst, subset_spill_offset, count, ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->dst.file == VGRF &&
|
2020-12-09 21:16:37 +02:00
|
|
|
inst->dst.nr == spill_reg &&
|
|
|
|
|
inst->opcode != SHADER_OPCODE_UNDEF) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int subset_spill_offset = spill_offset +
|
|
|
|
|
ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
|
2019-05-08 13:34:04 -05:00
|
|
|
fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);
|
2013-12-08 04:57:08 +01:00
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->dst.nr = spill_src.nr;
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->dst.offset %= REG_SIZE;
|
2012-07-06 17:18:35 -07:00
|
|
|
|
2014-10-27 16:50:12 -07:00
|
|
|
/* If we're immediately spilling the register, we should not use
|
|
|
|
|
* destination dependency hints. Doing so will cause the GPU to
|
|
|
|
|
* try to read and write the register at the same time and may
|
|
|
|
|
* hang the GPU.
|
|
|
|
|
*/
|
|
|
|
|
inst->no_dd_clear = false;
|
|
|
|
|
inst->no_dd_check = false;
|
|
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* Calculate the execution width of the scratch messages (which work
|
|
|
|
|
* in terms of 32 bit components so we have a fixed number of eight
|
|
|
|
|
* channels per spilled register). We attempt to write one
|
|
|
|
|
* exec_size-wide component of the variable at a time without
|
|
|
|
|
* exceeding the maximum number of (fake) MRF registers reserved for
|
|
|
|
|
* spills.
|
|
|
|
|
*/
|
2022-10-10 18:05:13 -07:00
|
|
|
const unsigned width = 8 * reg_unit(devinfo) *
|
|
|
|
|
DIV_ROUND_UP(MIN2(inst->dst.component_size(inst->exec_size),
|
|
|
|
|
spill_max_size(fs) * REG_SIZE),
|
|
|
|
|
reg_unit(devinfo) * REG_SIZE);
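/* Illustrative numbers only: with reg_unit() == 1 and a SIMD16
 * instruction writing a tightly packed 32-bit destination, one
 * exec_size-wide component is 16 * 4 == 64 bytes, i.e. two registers,
 * giving a scratch message width of 8 * 1 * 2 == 16 channels (assuming
 * spill_max_size() allows at least two registers).
 */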
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2016-05-16 01:23:44 -07:00
|
|
|
/* Spills should only write data initialized by the instruction for
|
2022-06-22 18:31:08 +02:00
|
|
|
* whichever channels are enabled in the execution mask. If that's
|
2016-05-16 01:23:44 -07:00
|
|
|
* not possible we'll have to emit a matching unspill before the
|
|
|
|
|
* instruction and set force_writemask_all on the spill.
|
|
|
|
|
*/
|
|
|
|
|
const bool per_channel =
|
2024-04-21 00:57:59 -07:00
|
|
|
inst->dst.is_contiguous() &&
|
|
|
|
|
brw_type_size_bytes(inst->dst.type) == 4 &&
|
2016-05-16 01:23:44 -07:00
|
|
|
inst->exec_size == width;
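/* For example (illustrative, not exhaustive): a contiguous
 * single-precision destination written by a SIMD16 instruction with a
 * 16-channel scratch message can be spilled per enabled channel,
 * whereas a 64-bit or strided destination cannot and falls back to the
 * force_writemask_all path below.
 */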
|
|
|
|
|
|
2016-05-15 22:59:04 -07:00
|
|
|
/* Builder used to emit the scratch messages. */
|
2016-05-16 01:23:44 -07:00
|
|
|
const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2012-07-06 17:18:35 -07:00
|
|
|
/* If our write is going to affect just part of the
|
2016-09-07 16:59:35 -07:00
|
|
|
* regs_written(inst), then we need to unspill the destination since
|
|
|
|
|
* we write back out all of the regs_written(). If the original
|
|
|
|
|
* instruction had force_writemask_all set and is not a partial
|
|
|
|
|
* write, there should be no need for the unspill since the
|
2016-05-16 01:23:44 -07:00
|
|
|
* instruction will be overwriting the whole destination in any case.
|
2010-10-19 09:25:51 -07:00
|
|
|
*/
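/* Concretely (illustrative): a predicated MOV to the spilled VGRF only
 * writes the enabled channels, so the stale contents are unspilled
 * first and the merged result is spilled back afterwards; an
 * unpredicated full-width write whose channels map one-to-one onto the
 * scratch message (per_channel above) skips the read-back entirely.
 */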
|
2019-04-24 12:38:28 +02:00
|
|
|
if (inst->is_partial_write() ||
|
2016-05-16 01:23:44 -07:00
|
|
|
(!inst->force_writemask_all && !per_channel))
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_unspill(ubld, &fs->shader_stats, spill_src,
|
2022-07-18 12:27:53 +03:00
|
|
|
subset_spill_offset, regs_written(inst), ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2022-05-24 02:44:53 -07:00
|
|
|
emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
|
2022-07-18 12:27:53 +03:00
|
|
|
subset_spill_offset, regs_written(inst), ip);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
2011-01-12 10:10:01 -08:00
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
for (fs_inst *inst = (fs_inst *)before->next;
|
|
|
|
|
inst != after; inst = (fs_inst *)inst->next)
|
|
|
|
|
setup_inst_interference(inst);
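/* Note that `before` and `after` bracket the original instruction, so
 * this walk picks up every unspill/spill instruction just emitted
 * around it (and the instruction itself) and records their
 * interference for the next allocation attempt.
 */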
|
|
|
|
|
|
|
|
|
|
/* We don't advance the ip for scratch read/write instructions
|
|
|
|
|
* because we consider them to have the same ip as the instruction we're
|
2020-10-09 04:27:35 -05:00
|
|
|
* spilling around for the purposes of interference. Also, we're
|
|
|
|
|
* inserting spill instructions without re-running liveness analysis
|
|
|
|
|
* and we don't want to mess up our IPs.
|
2019-05-08 13:34:04 -05:00
|
|
|
*/
|
2020-10-09 04:27:35 -05:00
|
|
|
if (!_mesa_set_search(spill_insts, inst))
|
2019-05-08 13:34:04 -05:00
|
|
|
ip++;
|
|
|
|
|
}
|
2020-10-12 15:07:25 -05:00
|
|
|
|
|
|
|
|
assert(ip == live_instr_count);
|
2010-10-20 10:26:29 -07:00
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
|
|
|
|
|
bool
|
2019-05-07 20:09:08 -05:00
|
|
|
fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
|
2019-05-07 16:48:27 -05:00
|
|
|
{
|
2019-05-08 13:34:04 -05:00
|
|
|
build_interference_graph(fs->spilled_any_registers || spill_all);
|
2019-05-07 18:14:46 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
unsigned spilled = 0;
|
2019-05-08 13:34:04 -05:00
|
|
|
while (1) {
|
2019-05-07 18:14:46 -05:00
|
|
|
/* Debug of register spilling: Go spill everything. */
|
|
|
|
|
if (unlikely(spill_all)) {
|
|
|
|
|
int reg = choose_spill_reg();
|
|
|
|
|
if (reg != -1) {
|
|
|
|
|
spill_reg(reg);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2019-05-07 18:14:46 -05:00
|
|
|
if (ra_allocate(g))
|
|
|
|
|
break;
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2019-05-07 18:14:46 -05:00
|
|
|
if (!allow_spilling)
|
2019-05-07 16:48:27 -05:00
|
|
|
return false;
|
|
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
/* Failed to allocate registers. Spill some regs, and loop back
|
2019-05-14 22:51:20 -05:00
|
|
|
* to try the allocation again.
|
|
|
|
|
*/
|
2020-10-23 15:58:06 -05:00
|
|
|
unsigned nr_spills = 1;
|
|
|
|
|
if (compiler->spilling_rate)
|
|
|
|
|
nr_spills = MAX2(1, spilled / compiler->spilling_rate);
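/* Example with made-up numbers: with spilling_rate == 4 the first few
 * iterations spill one register each; once eight registers have been
 * spilled, two are spilled per failed allocation attempt, ramping up
 * how aggressively hard-to-allocate shaders get spilled.
 */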
|
2019-05-14 22:51:20 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
for (unsigned j = 0; j < nr_spills; j++) {
|
|
|
|
|
int reg = choose_spill_reg();
|
|
|
|
|
if (reg == -1) {
|
|
|
|
|
if (j == 0)
|
|
|
|
|
return false; /* Nothing to spill */
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-05-08 13:34:04 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
/* If we're going to spill but we've never spilled before, we need
|
|
|
|
|
* to re-build the interference graph with MRFs enabled to allow
|
|
|
|
|
* spilling.
|
|
|
|
|
*/
|
|
|
|
|
if (!fs->spilled_any_registers) {
|
|
|
|
|
discard_interference_graph();
|
|
|
|
|
build_interference_graph(true);
|
|
|
|
|
}
|
2019-05-08 13:34:04 -05:00
|
|
|
|
2020-10-23 15:58:06 -05:00
|
|
|
spill_reg(reg);
|
|
|
|
|
spilled++;
|
|
|
|
|
}
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
|
2019-05-08 13:34:04 -05:00
|
|
|
if (spilled)
|
2016-03-13 19:26:37 -07:00
|
|
|
fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
2019-05-08 13:34:04 -05:00
|
|
|
|
2019-05-07 16:48:27 -05:00
|
|
|
/* Get the register chosen for each node, and map registers in the
|
|
|
|
|
* RA register classes back down to real hardware reg
|
|
|
|
|
* numbers.
|
|
|
|
|
*/
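/* Illustrative example: if node first_vgrf_node + 5 was assigned
 * register 12, VGRF 5 lands at g12 and grf_used is bumped to cover g12
 * plus however many hardware registers VGRF 5 occupies.
 */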
|
2019-05-07 20:09:08 -05:00
|
|
|
unsigned hw_reg_mapping[fs->alloc.count];
|
|
|
|
|
fs->grf_used = fs->first_non_payload_grf;
|
|
|
|
|
for (unsigned i = 0; i < fs->alloc.count; i++) {
|
2019-05-08 13:09:27 -05:00
|
|
|
int reg = ra_get_node_reg(g, first_vgrf_node + i);
|
2019-05-07 16:48:27 -05:00
|
|
|
|
2021-03-05 09:20:01 -08:00
|
|
|
hw_reg_mapping[i] = reg;
|
2019-05-07 20:09:08 -05:00
|
|
|
fs->grf_used = MAX2(fs->grf_used,
|
2022-06-28 17:49:38 -07:00
|
|
|
hw_reg_mapping[i] + DIV_ROUND_UP(fs->alloc.sizes[i],
|
|
|
|
|
reg_unit(devinfo)));
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
|
2019-05-07 20:09:08 -05:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
|
2022-06-28 17:49:38 -07:00
|
|
|
assign_reg(devinfo, hw_reg_mapping, &inst->dst);
|
2019-05-07 16:48:27 -05:00
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2022-06-28 17:49:38 -07:00
|
|
|
assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
|
2019-05-07 16:48:27 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-07 20:09:08 -05:00
|
|
|
fs->alloc.count = fs->grf_used;
|
2019-05-07 16:48:27 -05:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2019-05-07 20:09:08 -05:00
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
|
|
|
|
|
{
|
|
|
|
|
fs_reg_alloc alloc(this);
|
2019-05-07 18:14:46 -05:00
|
|
|
bool success = alloc.assign_regs(allow_spilling, spill_all);
|
|
|
|
|
if (!success && allow_spilling) {
|
|
|
|
|
fail("no register to spill:\n");
|
|
|
|
|
dump_instructions(NULL);
|
|
|
|
|
}
|
|
|
|
|
return success;
|
2019-05-07 20:09:08 -05:00
|
|
|
}
|