/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "util/register_allocate.h"

using namespace brw;

static void
assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
{
   if (reg->file == VGRF) {
      reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
      reg->offset %= REG_SIZE;
   }
}

void
fs_visitor::assign_regs_trivial()
{
   unsigned hw_reg_mapping[this->alloc.count + 1];
   unsigned i;
   int reg_width = dispatch_width / 8;

   /* Note that compressed instructions require alignment to 2 registers. */
   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
   for (i = 1; i <= this->alloc.count; i++) {
      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
                           this->alloc.sizes[i - 1]);
   }
   this->grf_used = hw_reg_mapping[this->alloc.count];

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assign_reg(hw_reg_mapping, &inst->dst);
      for (i = 0; i < inst->sources; i++) {
         assign_reg(hw_reg_mapping, &inst->src[i]);
      }
   }

   if (this->grf_used >= max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
           this->grf_used, max_grf);
   } else {
      this->alloc.count = this->grf_used;
   }
}

static void
brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
{
   const struct gen_device_info *devinfo = compiler->devinfo;
   int base_reg_count = BRW_MAX_GRF;
   const int index = _mesa_logbase2(dispatch_width / 8);
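   /* index selects compiler->fs_reg_sets[]: 0 for SIMD8, 1 for SIMD16 and
    * 2 for SIMD32, matching the three calls made from brw_fs_alloc_reg_sets().
    */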

   if (dispatch_width > 8 && devinfo->gen >= 7) {
      /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
       * SIMD16.  Therefore, we can use the exact same register sets for
       * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
       */
      compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
      return;
   }

   /* The registers used to make up almost all values handled in the compiler
    * are a scalar value occupying a single register (or 2 registers in the
    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
    * multiplying allocated register numbers by 2).  Things that were
    * aggregates of scalar values at the GLSL level were split to scalar
    * values by split_virtual_grfs().
    *
    * However, texture SEND messages return a series of contiguous registers
    * to write into.  We currently always ask for 4 registers, but we may
    * convert that to use less some day.
    *
    * Additionally, on gen5 we need aligned pairs of registers for the PLN
    * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
    * texturing.
    */
   const int class_count = MAX_VGRF_SIZE;
   int class_sizes[MAX_VGRF_SIZE];
   for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
      class_sizes[i] = i + 1;

   memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
          sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
   int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;

   /* Compute the total number of registers across all classes. */
   int ra_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      if (devinfo->gen <= 5 && dispatch_width >= 16) {
         /* From the G45 PRM:
          *
          * In order to reduce the hardware complexity, the following
          * rules and restrictions apply to the compressed instruction:
          * ...
          * * Operand Alignment Rule: With the exceptions listed below, a
          *   source/destination operand in general should be aligned to
          *   even 256-bit physical register with a region size equal to
          *   two 256-bit physical register
          */
         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
      } else {
         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
      }
      /* Mark the last register.  We'll fill in the beginnings later. */
      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
   }

   /* Fill out the rest of the range markers */
   for (int i = 1; i < 17; ++i) {
      if (class_to_ra_reg_range[i] == 0)
         class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
   }
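
   /* At this point class_to_ra_reg_range[n] is one past the last ra_reg
    * belonging to the class of size n.  For example, in the SIMD8 case with
    * base_reg_count == 128, size-1 registers occupy ra_regs 0..127, size-2
    * registers occupy 128..254, and so on.
    */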

   uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
   if (devinfo->gen >= 6)
      ra_set_allocate_round_robin(regs);
   int *classes = ralloc_array(compiler, int, class_count);
   int aligned_pairs_class = -1;

   /* Allocate space for q values.  We allocate class_count + 1 because we
    * want to leave room for the aligned pairs class if we have it. */
   unsigned int **q_values = ralloc_array(compiler, unsigned int *,
                                          class_count + 1);
   for (int i = 0; i < class_count + 1; ++i)
      q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);

   /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
    */
   int reg = 0;
   int pairs_base_reg = 0;
   int pairs_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      int class_reg_count;
      if (devinfo->gen <= 5 && dispatch_width >= 16) {
         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;

         /* See comment below.  The only difference here is that we are
          * dealing with pairs of registers instead of single registers.
          * Registers of odd sizes simply get rounded up. */
         for (int j = 0; j < class_count; j++)
            q_values[i][j] = (class_sizes[i] + 1) / 2 +
                             (class_sizes[j] + 1) / 2 - 1;
      } else {
         class_reg_count = base_reg_count - (class_sizes[i] - 1);

         /* From register_allocate.c:
          *
          * q(B,C) (indexed by C, B is this register class) in
          * Runeson/Nyström paper.  This is "how many registers of B could
          * the worst choice register from C conflict with".
          *
          * If we just let the register allocation algorithm compute these
          * values, it is extremely expensive.  However, since all of our
          * registers are laid out, we can very easily compute them
          * ourselves.  View the register from C as fixed starting at GRF n
          * somewhere in the middle, and the register from B as sliding back
          * and forth.  Then the first register to conflict from B is the
          * one starting at n - class_size[B] + 1 and the last register to
          * conflict will start at n + class_size[B] - 1.  Therefore, the
          * number of conflicts from B is class_size[B] + class_size[C] - 1.
          *
          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
          * B | | | | | |n| --> | | | | | | |
          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
          *             +-+-+-+-+-+
          * C           |n| | | | |
          *             +-+-+-+-+-+
          */
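         /* Worked example: with class_size[B] == 2 and class_size[C] == 5,
          * the sliding 2-register allocation can overlap the fixed
          * 5-register allocation at 2 + 5 - 1 == 6 distinct starting
          * offsets, so q(B,C) == 6.
          */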
         for (int j = 0; j < class_count; j++)
            q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
      }
      classes[i] = ra_alloc_reg_class(regs);

      /* Save this off for the aligned pair class at the end. */
      if (class_sizes[i] == 2) {
         pairs_base_reg = reg;
         pairs_reg_count = class_reg_count;
      }

      if (devinfo->gen <= 5 && dispatch_width >= 16) {
         for (int j = 0; j < class_reg_count; j++) {
            ra_class_add_reg(regs, classes[i], reg);

            ra_reg_to_grf[reg] = j * 2;

            for (int base_reg = j;
                 base_reg < j + (class_sizes[i] + 1) / 2;
                 base_reg++) {
               ra_add_reg_conflict(regs, base_reg, reg);
            }

            reg++;
         }
      } else {
         for (int j = 0; j < class_reg_count; j++) {
            ra_class_add_reg(regs, classes[i], reg);

            ra_reg_to_grf[reg] = j;

            for (int base_reg = j;
                 base_reg < j + class_sizes[i];
                 base_reg++) {
               ra_add_reg_conflict(regs, base_reg, reg);
            }

            reg++;
         }
      }
   }
   assert(reg == ra_reg_count);

   /* Applying transitivity to all of the base registers gives us the
    * appropriate register conflict relationships everywhere.
    */
   for (int reg = 0; reg < base_reg_count; reg++)
      ra_make_reg_conflicts_transitive(regs, reg);

   /* Add a special class for aligned pairs, which we'll put delta_xy
    * in on Gen <= 6 so that we can do PLN.
    */
   if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) {
      aligned_pairs_class = ra_alloc_reg_class(regs);

      for (int i = 0; i < pairs_reg_count; i++) {
         if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
            ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
         }
      }

      for (int i = 0; i < class_count; i++) {
         /* These are a little counter-intuitive because the pair registers
          * are required to be aligned while the register they are
          * potentially interfering with are not.  In the case where the
          * size is even, the worst-case is that the register is
          * odd-aligned.  In the odd-size case, it doesn't matter.
          */
         q_values[class_count][i] = class_sizes[i] / 2 + 1;
         q_values[i][class_count] = class_sizes[i] + 1;
      }
      q_values[class_count][class_count] = 1;
   }

   ra_set_finalize(regs, q_values);

   ralloc_free(q_values);

   compiler->fs_reg_sets[index].regs = regs;
   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
      compiler->fs_reg_sets[index].classes[i] = -1;
   for (int i = 0; i < class_count; i++)
      compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
   compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
   compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
}

void
brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
{
   brw_alloc_reg_set(compiler, 8);
   brw_alloc_reg_set(compiler, 16);
   brw_alloc_reg_set(compiler, 32);
}

static int
count_to_loop_end(const bblock_t *block)
{
   if (block->end()->opcode == BRW_OPCODE_WHILE)
      return block->end_ip;

   int depth = 1;
   /* Skip the first block, since we don't want to count the DO that the
    * calling function already found.
    */
   for (block = block->next();
        depth > 0;
        block = block->next()) {
      if (block->start()->opcode == BRW_OPCODE_DO)
         depth++;
      if (block->end()->opcode == BRW_OPCODE_WHILE) {
         depth--;
         if (depth == 0)
            return block->end_ip;
      }
   }
   unreachable("not reached");
}

void fs_visitor::calculate_payload_ranges(int payload_node_count,
                                          int *payload_last_use_ip)
{
   int loop_depth = 0;
   int loop_end_ip = 0;

   for (int i = 0; i < payload_node_count; i++)
      payload_last_use_ip[i] = -1;

   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;

         /* Since payload regs are deffed only at the start of the shader
          * execution, any uses of the payload within a loop mean the live
          * interval extends to the end of the outermost loop.  Find the ip of
          * the end now.
          */
         if (loop_depth == 1)
            loop_end_ip = count_to_loop_end(block);
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      default:
         break;
      }

      int use_ip;
      if (loop_depth > 0)
         use_ip = loop_end_ip;
      else
         use_ip = ip;

      /* Note that UNIFORM args have been turned into FIXED_GRF by
       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
       * the start (see interp_reg()).
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF) {
            int node_nr = inst->src[i].nr;
            if (node_nr >= payload_node_count)
               continue;

            for (unsigned j = 0; j < regs_read(inst, i); j++) {
               payload_last_use_ip[node_nr + j] = use_ip;
               assert(node_nr + j < unsigned(payload_node_count));
            }
         }
      }

      /* Special case instructions which have extra implied registers used. */
      switch (inst->opcode) {
      case CS_OPCODE_CS_TERMINATE:
         payload_last_use_ip[0] = use_ip;
         break;

      default:
         if (inst->eot) {
            /* We could omit this for the !inst->header_present case, except
             * that the simulator apparently incorrectly reads from g0/g1
             * instead of sideband.  It also really freaks out driver
             * developers to see g0 used in unusual places, so just always
             * reserve it.
             */
            payload_last_use_ip[0] = use_ip;
            payload_last_use_ip[1] = use_ip;
         }
         break;
      }

      ip++;
   }
}

/**
 * Sets up interference between thread payload registers and the virtual GRFs
 * to be allocated for program temporaries.
 *
 * We want to be able to reallocate the payload for our virtual GRFs, notably
 * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
 * our 128 registers.
 *
 * The layout of the payload registers is:
 *
 * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
 * payload.num_regs..payload.num_regs+curb_read_length-1: uniform data
 * payload.num_regs+curb_read_length..first_non_payload_grf-1: setup coefficients.
 *
 * And we have payload_node_count nodes covering these registers in order
 * (note that in SIMD16, a node is two registers).
 */
void
fs_visitor::setup_payload_interference(struct ra_graph *g,
                                       int payload_node_count,
                                       int first_payload_node)
{
   int payload_last_use_ip[payload_node_count];
   calculate_payload_ranges(payload_node_count, payload_last_use_ip);

   for (int i = 0; i < payload_node_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      /* Mark the payload node as interfering with any virtual grf that is
       * live between the start of the program and our last use of the payload
       * node.
       */
      for (unsigned j = 0; j < this->alloc.count; j++) {
         /* Note that we use a <= comparison, unlike virtual_grf_interferes(),
          * in order to not have to worry about the uniform issue described in
          * calculate_live_intervals().
          */
         if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
            ra_add_node_interference(g, first_payload_node + i, j);
         }
      }
   }

   for (int i = 0; i < payload_node_count; i++) {
      /* Mark each payload node as being allocated to its physical register.
       *
       * The alternative would be to have per-physical-register classes, which
       * would just be silly.
       */
      if (devinfo->gen <= 5 && dispatch_width >= 16) {
         /* We have to divide by 2 here because we only have even numbered
          * registers.  Some of the payload registers will be odd, but
          * that's ok because their physical register numbers have already
          * been assigned.  The only thing this is used for is interference.
          */
         ra_set_node_reg(g, first_payload_node + i, i / 2);
      } else {
         ra_set_node_reg(g, first_payload_node + i, i);
      }
   }
}

/**
 * Sets the mrf_used array to indicate which MRFs are used by the shader IR
 *
 * This is used in assign_regs() to decide which of the GRFs that we use as
 * MRFs on gen7 get normally register allocated, and in register spilling to
 * see if we can actually use MRFs to do spills without overwriting normal MRF
 * contents.
 */
static void
get_used_mrfs(fs_visitor *v, bool *mrf_used)
{
   int reg_width = v->dispatch_width / 8;

   memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool));

   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
         mrf_used[reg] = true;
         if (reg_width == 2) {
            if (inst->dst.nr & BRW_MRF_COMPR4) {
               mrf_used[reg + 4] = true;
            } else {
               mrf_used[reg + 1] = true;
            }
         }
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            mrf_used[inst->base_mrf + i] = true;
         }
      }
   }
}

/**
 * Sets interference between virtual GRFs and usage of the high GRFs for SEND
 * messages (treated as MRFs in code generation).
 */
static void
setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
                            int first_mrf_node, int *first_used_mrf)
{
   bool mrf_used[BRW_MAX_MRF(v->devinfo->gen)];
   get_used_mrfs(v, mrf_used);

   *first_used_mrf = BRW_MAX_MRF(v->devinfo->gen);
   for (int i = 0; i < BRW_MAX_MRF(v->devinfo->gen); i++) {
      /* Mark each MRF reg node as being allocated to its physical register.
       *
       * The alternative would be to have per-physical-register classes, which
       * would just be silly.
       */
      ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);

      /* Since we don't have any live/dead analysis on the MRFs, just mark all
       * that are used as conflicting with all virtual GRFs.
       */
      if (mrf_used[i]) {
         if (i < *first_used_mrf)
            *first_used_mrf = i;

         for (unsigned j = 0; j < v->alloc.count; j++) {
            ra_add_node_interference(g, first_mrf_node + i, j);
         }
      }
   }
}

bool
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
{
   /* Most of this allocation was written for a reg_width of 1
    * (dispatch_width == 8).  In extending to SIMD16, the code was
    * left in place and it was converted to have the hardware
    * registers it's allocating be contiguous physical pairs of regs
    * for reg_width == 2.
    */
   int reg_width = dispatch_width / 8;
   unsigned hw_reg_mapping[this->alloc.count];
   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
   int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */
   calculate_live_intervals();

   int node_count = this->alloc.count;
   int first_payload_node = node_count;
   node_count += payload_node_count;
   int first_mrf_hack_node = node_count;
   if (devinfo->gen >= 7)
      node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
   int grf127_send_hack_node = node_count;
   if (devinfo->gen >= 8)
      node_count++;
   struct ra_graph *g =
      ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);

   for (unsigned i = 0; i < this->alloc.count; i++) {
      unsigned size = this->alloc.sizes[i];
      int c;

      assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
             "Register allocation relies on split_virtual_grfs()");
      c = compiler->fs_reg_sets[rsi].classes[size - 1];

      /* Special case: on pre-GEN6 hardware that supports PLN, the
       * second operand of a PLN instruction needs to be an
       * even-numbered register, so we have a special register class
       * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
       * uses this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the
       * second operand of a PLN instruction (since it doesn't support
       * any other interpolation modes).  So all we need to do is find
       * that register and set it to the appropriate class.
       */
      if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
         c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
      }

      ra_set_node_class(g, i, c);

      for (unsigned j = 0; j < i; j++) {
         if (virtual_grf_interferes(i, j)) {
            ra_add_node_interference(g, i, j);
         }
      }
   }

   /* Certain instructions can't safely use the same register for their
    * sources and destination.  Add interference.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
         for (unsigned i = 0; i < 3; i++) {
            if (inst->src[i].file == VGRF) {
               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
            }
         }
      }
   }

   setup_payload_interference(g, payload_node_count, first_payload_node);
   if (devinfo->gen >= 7) {
      int first_used_mrf = BRW_MAX_MRF(devinfo->gen);
      setup_mrf_hack_interference(this, g, first_mrf_hack_node,
                                  &first_used_mrf);

      foreach_block_and_inst(block, fs_inst, inst, cfg) {
         /* When we do send-from-GRF for FB writes, we need to ensure that
          * the last write instruction sends from a high register.  This is
          * because the vertex fetcher wants to start filling the low
          * payload registers while the pixel data port is still working on
          * writing out the memory.  If we don't do this, we get rendering
          * artifacts.
          *
          * We could just do "something high".  Instead, we just pick the
          * highest register that works.
          */
         if (inst->eot) {
            const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
                             inst->src[2].nr : inst->src[0].nr;
            int size = alloc.sizes[vgrf];
            int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;

            /* If something happened to spill, we want to push the EOT send
             * register early enough in the register file that we don't
             * conflict with any used MRF hack registers.
             */
            reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;

            ra_set_node_reg(g, vgrf, reg);
            break;
         }
      }
   }

   /* In 16-wide instructions we have an issue where a compressed
    * instruction is actually two instructions executed simultaneously.
    * It's actually ok to have the source and destination registers be
    * the same.  In this case, each instruction over-writes its own
    * source and there's no problem.  The real problem here is if the
    * source and destination registers are off by one.  Then you can end
    * up in a scenario where the first instruction over-writes the
    * source of the second instruction.  Since the compiler doesn't know
    * about this level of granularity, we simply make the source and
    * destination interfere.
    */
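   /* For example, a SIMD16 instruction whose destination starts at g5 and
    * whose source starts at g4 executes as two SIMD8 halves: the first half
    * writes g5, clobbering the second half's source before it is read.
    */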
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->exec_size < 16 || inst->dst.file != VGRF)
         continue;

      for (int i = 0; i < inst->sources; ++i) {
         if (inst->src[i].file == VGRF) {
            ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
         }
      }
   }

   if (devinfo->gen >= 8) {
      /* From the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * We avoid using grf127 as part of the destination of send messages by
       * adding a node interference to the grf127_send_hack_node.  This node
       * has a fixed assignment to grf127.
       *
       * We don't apply it to SIMD16 instructions because previous code avoids
       * any register overlap between sources and destination.
       */
      ra_set_node_reg(g, grf127_send_hack_node, 127);
      foreach_block_and_inst(block, fs_inst, inst, cfg) {
         if (inst->exec_size < 16 && inst->is_send_from_grf() &&
             inst->dst.file == VGRF)
            ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
      }

      if (spilled_any_registers) {
         foreach_block_and_inst(block, fs_inst, inst, cfg) {
            /* Spill instructions are generated as SEND messages from MRF,
             * but since Gen7+ supports sending from GRF the driver maps
             * these MRF registers to GRFs.  The implementation reuses the
             * destination of the send message as a source, so there is
             * guaranteed to be an overlap; create an interference between
             * the destination and grf127.
             */
            if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ ||
                 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) &&
                inst->dst.file == VGRF)
               ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
         }
      }
   }

   /* From the Skylake PRM Vol. 2a docs for sends:
    *
    *    "It is required that the second block of GRFs does not overlap with
    *    the first block."
    *
    * Normally, this is taken care of by fixup_sends_duplicate_payload() but
    * in the case where one of the registers is an undefined value, the
    * register allocator may decide that they don't interfere even though
    * they're used as sources in the same instruction.  We also need to add
    * interference here.
    */
   if (devinfo->gen >= 9) {
      foreach_block_and_inst(block, fs_inst, inst, cfg) {
         if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
             inst->src[2].file == VGRF &&
             inst->src[3].file == VGRF &&
             inst->src[2].nr != inst->src[3].nr) {
            for (unsigned i = 0; i < inst->mlen; i++) {
               for (unsigned j = 0; j < inst->ex_mlen; j++) {
                  ra_add_node_interference(g, inst->src[2].nr + i,
                                           inst->src[3].nr + j);
               }
            }
         }
      }
   }

   /* Debug of register spilling: Go spill everything. */
   if (unlikely(spill_all)) {
      int reg = choose_spill_reg(g);

      if (reg != -1) {
         spill_reg(reg);
         ralloc_free(g);
         return false;
      }
   }

   if (!ra_allocate(g)) {
      /* Failed to allocate registers.  Spill a reg, and the caller will
       * loop back into here to try again.
       */
      int reg = choose_spill_reg(g);

      if (reg == -1) {
         fail("no register to spill:\n");
         dump_instructions(NULL);
      } else if (allow_spilling) {
         spill_reg(reg);
      }

      ralloc_free(g);

      return false;
   }

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   this->grf_used = payload_node_count;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      int reg = ra_get_node_reg(g, i);

      hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
      this->grf_used = MAX2(this->grf_used,
                            hw_reg_mapping[i] + this->alloc.sizes[i]);
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assign_reg(hw_reg_mapping, &inst->dst);
      for (int i = 0; i < inst->sources; i++) {
         assign_reg(hw_reg_mapping, &inst->src[i]);
      }
   }

   this->alloc.count = this->grf_used;

   ralloc_free(g);

   return true;
}

namespace {
   /**
    * Maximum spill block size we expect to encounter in 32B units.
    *
    * This is somewhat arbitrary and doesn't necessarily limit the maximum
    * variable size that can be spilled -- A higher value will allow a
    * variable of a given size to be spilled more efficiently with a smaller
    * number of scratch messages, but will increase the likelihood of a
    * collision between the MRFs reserved for spilling and other MRFs used by
    * the program (and possibly increase GRF register pressure on platforms
    * without hardware MRFs), which could cause register allocation to fail.
    *
    * For the moment reserve just enough space so a register of 32 bit
    * component type and natural region width can be spilled without splitting
    * into multiple (force_writemask_all) scratch messages.
    */
   unsigned
   spill_max_size(const backend_shader *s)
   {
      /* FINISHME - On Gen7+ it should be possible to avoid this limit
       *            altogether by spilling directly from the temporary GRF
       *            allocated to hold the result of the instruction (and the
       *            scratch write header).
       */
      /* FINISHME - The shader's dispatch width probably belongs in
       *            backend_shader (or some nonexistent fs_shader class?)
       *            rather than in the visitor class.
       */
      return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
   }

   /**
    * First MRF register available for spilling.
    */
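   /* For example, on gen6 (where BRW_MAX_MRF is 24) a SIMD16 shader has
    * spill_max_size == 2, so spills start at MRF 21, using m21 for the
    * message header and m22..m23 for the data.
    */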
   unsigned
   spill_base_mrf(const backend_shader *s)
   {
      return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
   }
}

static void
emit_unspill(const fs_builder &bld, fs_reg dst,
             uint32_t spill_offset, unsigned count)
{
   const gen_device_info *devinfo = bld.shader->devinfo;
   const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
                             REG_SIZE;
   assert(count % reg_size == 0);

   for (unsigned i = 0; i < count / reg_size; i++) {
      /* The Gen7 descriptor-based offset is 12 bits of HWORD units.  Because
       * the Gen7-style scratch block read is hardwired to BTI 255, on Gen9+
       * it would cause the DC to do an IA-coherent read, which largely
       * outweighs the slight advantage from not having to provide the address
       * as part of the message header, so we're better off using plain old
       * oword block reads.
       */
      bool gen7_read = (devinfo->gen >= 7 && devinfo->gen < 9 &&
                        spill_offset < (1 << 12) * REG_SIZE);
      fs_inst *unspill_inst = bld.emit(gen7_read ?
                                       SHADER_OPCODE_GEN7_SCRATCH_READ :
                                       SHADER_OPCODE_GEN4_SCRATCH_READ,
                                       dst);
      unspill_inst->offset = spill_offset;

      if (!gen7_read) {
         unspill_inst->base_mrf = spill_base_mrf(bld.shader);
         unspill_inst->mlen = 1; /* header contains offset */
      }

      dst.offset += reg_size * REG_SIZE;
      spill_offset += reg_size * REG_SIZE;
   }
}

2016-05-15 20:30:06 -07:00
|
|
|
static void
|
|
|
|
|
emit_spill(const fs_builder &bld, fs_reg src,
|
|
|
|
|
uint32_t spill_offset, unsigned count)
|
2014-08-18 14:27:55 -07:00
|
|
|
{
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned reg_size = src.component_size(bld.dispatch_width()) /
|
|
|
|
|
REG_SIZE;
|
|
|
|
|
assert(count % reg_size == 0);
|
2015-06-03 19:05:54 +03:00
|
|
|
|
2016-05-12 00:39:06 -07:00
|
|
|
for (unsigned i = 0; i < count / reg_size; i++) {
|
2014-08-18 14:27:55 -07:00
|
|
|
fs_inst *spill_inst =
|
2016-05-15 22:59:04 -07:00
|
|
|
bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
|
2016-09-01 12:42:20 -07:00
|
|
|
src.offset += reg_size * REG_SIZE;
|
2014-10-24 11:41:25 -07:00
|
|
|
spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
|
2014-08-18 14:27:55 -07:00
|
|
|
spill_inst->mlen = 1 + reg_size; /* header, value */
|
2016-05-15 20:30:06 -07:00
|
|
|
spill_inst->base_mrf = spill_base_mrf(bld.shader);
|
2013-10-09 17:17:59 -07:00
|
|
|
}
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
|
fs_visitor::choose_spill_reg(struct ra_graph *g)
|
|
|
|
|
{
|
2017-04-09 17:28:58 -07:00
|
|
|
float block_scale = 1.0;
|
2015-02-10 15:51:34 +02:00
|
|
|
float spill_costs[this->alloc.count];
|
|
|
|
|
bool no_spill[this->alloc.count];
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
for (unsigned i = 0; i < this->alloc.count; i++) {
|
2010-10-19 09:25:51 -07:00
|
|
|
spill_costs[i] = 0.0;
|
|
|
|
|
no_spill[i] = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Calculate costs for spilling nodes. Call it a cost of 1 per
|
|
|
|
|
* spill/unspill we'll have to do, and guess that the insides of
|
|
|
|
|
* loops run 10 times.
|
|
|
|
|
*/
|
2014-09-01 13:35:04 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2016-04-27 02:07:08 -07:00
|
|
|
if (inst->src[i].file == VGRF)
|
2017-04-20 11:42:27 -07:00
|
|
|
spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
2016-04-27 02:07:08 -07:00
|
|
|
if (inst->dst.file == VGRF)
|
2017-04-20 11:44:01 -07:00
|
|
|
spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;
|
2012-09-19 13:28:00 -07:00
|
|
|
|
2010-10-19 09:25:51 -07:00
|
|
|
switch (inst->opcode) {
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_DO:
|
2017-04-09 17:28:58 -07:00
|
|
|
block_scale *= 10;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_WHILE:
|
2017-04-09 17:28:58 -07:00
|
|
|
block_scale /= 10;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-04-09 17:28:58 -07:00
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
case BRW_OPCODE_IFF:
|
|
|
|
|
block_scale *= 0.5;
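/* Rough static weighting: assume code inside either arm of an if executes
 * about half the time; the ENDIF case below undoes the scaling.
 */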
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_ENDIF:
|
|
|
|
|
block_scale /= 0.5;
|
|
|
|
|
break;
|
|
|
|
|
|
2013-10-16 11:45:06 -07:00
|
|
|
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->src[0].file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
no_spill[inst->src[0].nr] = true;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
|
|
|
|
|
2013-10-16 11:45:06 -07:00
|
|
|
case SHADER_OPCODE_GEN4_SCRATCH_READ:
|
2013-10-16 11:51:22 -07:00
|
|
|
case SHADER_OPCODE_GEN7_SCRATCH_READ:
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->dst.file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
no_spill[inst->dst.nr] = true;
|
2010-10-19 09:25:51 -07:00
|
|
|
break;
|
2011-05-03 10:55:50 -07:00
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
break;
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
for (unsigned i = 0; i < this->alloc.count; i++) {
|
2019-04-13 16:01:50 -05:00
|
|
|
int live_length = virtual_grf_end[i] - virtual_grf_start[i];
|
|
|
|
|
if (live_length <= 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* Divide the cost (in number of spills/fills) by the log of the length
|
|
|
|
|
* of the live range of the register. This will encourage spill logic
|
|
|
|
|
* to spill long-lived things before spilling short-lived things where
|
|
|
|
|
* spilling is less likely to actually do us any good. We use the log
|
|
|
|
|
* of the length because it will fall off very quickly and not cause us
|
|
|
|
|
* to spill medium length registers with more uses.
|
|
|
|
|
*/
|
|
|
|
|
float adjusted_cost = spill_costs[i] / logf(live_length);
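/* Illustrative numbers: 10 spill points over a 100-instruction live range
 * cost 10 / ln(100) ~= 2.2, while the same 10 points over a 10-instruction
 * range cost 10 / ln(10) ~= 4.3, so the long-lived register looks cheaper
 * to spill.
 */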
|
2010-10-19 09:25:51 -07:00
|
|
|
if (!no_spill[i])
|
2019-04-13 16:01:50 -05:00
|
|
|
ra_set_node_spill_cost(g, i, adjusted_cost);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ra_get_best_spill_node(g);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2018-12-10 14:49:49 -08:00
|
|
|
fs_visitor::spill_reg(unsigned spill_reg)
|
2010-10-19 09:25:51 -07:00
|
|
|
{
|
2015-02-10 15:51:34 +02:00
|
|
|
int size = alloc.sizes[spill_reg];
|
2014-05-13 21:00:35 -07:00
|
|
|
unsigned int spill_offset = last_scratch;
|
2010-10-19 09:25:51 -07:00
|
|
|
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
|
2013-10-29 12:46:18 -07:00
|
|
|
|
|
|
|
|
/* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
|
|
|
|
|
* using up to 11 MRFs starting from either m1 or m2, and fb writes can use
|
|
|
|
|
* up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
|
|
|
|
|
* m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
|
|
|
|
|
* depth), starting from m1. In summary: We may not be able to spill in
|
|
|
|
|
* SIMD16 mode, because we'd stomp the FB writes.
|
|
|
|
|
*/
|
|
|
|
|
if (!spilled_any_registers) {
|
i965: Turn BRW_MAX_MRF into a macro that accepts a hardware generation
There are some bug reports about shaders failing to compile in gen6
because MRF 14 is used when we need to spill. For example:
https://bugs.freedesktop.org/show_bug.cgi?id=86469
https://bugs.freedesktop.org/show_bug.cgi?id=90631
Discussion in bugzilla pointed to the fact that gen6 might actually have
24 MRF registers available instead of 16, so we could use other MRF
registers and avoid these conflicts (we still need to investigate why
some shaders need up to MRF 14 anyway, since this is not expected).
Notice that the hardware docs are not clear about this fact:
SNB PRM Vol4 Part2's "Table 5-4. MRF Registers Available in Device
Hardware" says "Number per Thread" - "24 registers"
However, SNB PRM Vol4 Part1, 1.6.1 Message Register File (MRF) says:
"Normal threads should construct their messages in m1..m15. (...)
Regardless of actual hardware implementation, the thread should
not assume that MRF addresses above m15 wrap to legal MRF registers."
Therefore experimentation was necessary to evaluate if we had these extra
MRF registers available or not. This was tested in gen6 using MRF
registers 21..23 for spilling and doing a full piglit run (all.py) forcing
spilling of everything on the FS backend. It was also tested by doing
spilling of everything on both the FS and the VS backends with a piglit run
of shader.py. In both cases no regressions were observed. In fact, many of
these tests where helped in the cases where we forced spilling, since that
triggered the same underlying problem described in the bug reports. Here are
some results using INTEL_DEBUG=spill_fs,spill_vec4 for a shader.py run on
gen6 hardware:
Using MRFs 13..15 for spilling:
crash: 2, fail: 113, pass: 6621, skip: 5461
Using MRFs 21..23 for spilling:
crash: 2, fail: 12, pass: 6722, skip: 5461
This patch sets the ground for later patches to implement spilling
using MRF registers 21..23 in gen6.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-09-15 16:00:26 +02:00
|
|
|
bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
|
2015-06-06 12:08:00 -07:00
|
|
|
get_used_mrfs(this, mrf_used);
|
2013-10-29 12:46:18 -07:00
|
|
|
|
2016-04-27 02:16:22 -07:00
|
|
|
for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) {
|
2013-10-29 12:46:18 -07:00
|
|
|
if (mrf_used[i]) {
|
|
|
|
|
fail("Register spilling not supported with m%d used", i);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
spilled_any_registers = true;
|
|
|
|
|
}
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2014-10-24 11:41:25 -07:00
|
|
|
last_scratch += size * REG_SIZE;
|
2013-10-16 12:16:51 -07:00
|
|
|
|
2010-10-19 09:25:51 -07:00
|
|
|
/* Generate spill/unspill instructions for the objects being
|
|
|
|
|
* spilled. Right now, we spill or unspill the whole thing to a
|
|
|
|
|
* virtual grf of the same size. For most instructions, though, we
|
|
|
|
|
* could just spill/unspill the GRF being accessed.
|
|
|
|
|
*/
|
2014-07-15 11:45:20 -07:00
|
|
|
foreach_block_and_inst (block, fs_inst, inst, cfg) {
|
2016-05-15 20:30:06 -07:00
|
|
|
const fs_builder ibld = fs_builder(this, block, inst);
|
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->src[i].file == VGRF &&
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->src[i].nr == spill_reg) {
|
2016-09-07 16:59:35 -07:00
|
|
|
int count = regs_read(inst, i);
|
2016-09-01 12:42:20 -07:00
|
|
|
int subset_spill_offset = spill_offset +
|
|
|
|
|
ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
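/* Round the scratch address down to a register boundary; the sub-register
 * part of the offset is reapplied to the unspill temporary just below.
 */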
|
2016-09-07 16:59:35 -07:00
|
|
|
fs_reg unspill_dst(VGRF, alloc.allocate(count));
|
2013-10-09 17:17:59 -07:00
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->src[i].nr = unspill_dst.nr;
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->src[i].offset %= REG_SIZE;
|
2013-10-09 17:17:59 -07:00
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* We read the largest power-of-two divisor of the register count
|
|
|
|
|
* (because only POT scratch read blocks are allowed by the
|
|
|
|
|
* hardware) up to the maximum supported block size.
|
|
|
|
|
*/
|
2016-05-15 22:59:04 -07:00
|
|
|
const unsigned width =
|
2016-09-07 16:59:35 -07:00
|
|
|
MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
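/* e.g. count == 3 gives width 8 (three single-register reads) while
 * count == 4 gives width 32 (one four-register read), since 32 channels of
 * 32-bit data span four GRFs.
 */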
|
2016-05-16 00:59:37 -07:00
|
|
|
|
|
|
|
|
/* Set exec_all() on unspill messages under the (rather
|
|
|
|
|
* pessimistic) assumption that there is no one-to-one
|
|
|
|
|
* correspondence between channels of the spilled variable in
|
|
|
|
|
* scratch space and the scratch read message, which operates on
|
|
|
|
|
* 32 bit channels. It shouldn't hurt in any case because the
|
|
|
|
|
* unspill destination is a block-local temporary.
|
|
|
|
|
*/
|
|
|
|
|
emit_unspill(ibld.exec_all().group(width, 0),
|
2016-09-07 16:59:35 -07:00
|
|
|
unspill_dst, subset_spill_offset, count);
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
if (inst->dst.file == VGRF &&
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->dst.nr == spill_reg) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int subset_spill_offset = spill_offset +
|
|
|
|
|
ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
|
2016-09-07 16:59:35 -07:00
|
|
|
fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst)));
|
2013-12-08 04:57:08 +01:00
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
inst->dst.nr = spill_src.nr;
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->dst.offset %= REG_SIZE;
|
2012-07-06 17:18:35 -07:00
|
|
|
|
2014-10-27 16:50:12 -07:00
|
|
|
/* If we're immediately spilling the register, we should not use
|
|
|
|
|
* destination dependency hints. Doing so will cause the GPU to
|
|
|
|
|
* try to read and write the register at the same time and may
|
|
|
|
|
* hang the GPU.
|
|
|
|
|
*/
|
|
|
|
|
inst->no_dd_clear = false;
|
|
|
|
|
inst->no_dd_check = false;
|
|
|
|
|
|
2016-05-16 01:03:43 -07:00
|
|
|
/* Calculate the execution width of the scratch messages (which work
|
|
|
|
|
* in terms of 32 bit components so we have a fixed number of eight
|
|
|
|
|
* channels per spilled register). We attempt to write one
|
|
|
|
|
* exec_size-wide component of the variable at a time without
|
|
|
|
|
* exceeding the maximum number of (fake) MRF registers reserved for
|
|
|
|
|
* spills.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned width = 8 * MIN2(
|
|
|
|
|
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
|
|
|
|
|
spill_max_size(this));
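/* e.g. a SIMD16 instruction with a 32-bit destination covers two GRFs per
 * component, so width is 16 as long as spill_max_size() leaves at least two
 * payload MRFs available.
 */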
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2016-05-16 01:23:44 -07:00
|
|
|
/* Spills should only write data initialized by the instruction for
|
|
|
|
|
* whichever channels are enabled in the execution mask.  If that's
|
|
|
|
|
* not possible we'll have to emit a matching unspill before the
|
|
|
|
|
* instruction and set force_writemask_all on the spill.
|
|
|
|
|
*/
|
|
|
|
|
const bool per_channel =
|
|
|
|
|
inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
|
|
|
|
|
inst->exec_size == width;
|
|
|
|
|
|
2016-05-15 22:59:04 -07:00
|
|
|
/* Builder used to emit the scratch messages. */
|
2016-05-16 01:23:44 -07:00
|
|
|
const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
|
2016-05-15 22:59:04 -07:00
|
|
|
|
2012-07-06 17:18:35 -07:00
|
|
|
/* If our write is going to affect just part of the
|
2016-09-07 16:59:35 -07:00
|
|
|
* regs_written(inst), then we need to unspill the destination since
|
|
|
|
|
* we write back out all of the regs_written(). If the original
|
|
|
|
|
* instruction had force_writemask_all set and is not a partial
|
|
|
|
|
* write, there should be no need for the unspill since the
|
2016-05-16 01:23:44 -07:00
|
|
|
* instruction will be overwriting the whole destination in any case.
|
2010-10-19 09:25:51 -07:00
|
|
|
*/
|
intel/compiler: split is_partial_write() into two variants
This function is used in two different scenarios that for 32-bit
instructions are the same, but for 16-bit instructions are not.
One scenario is that in which we are working at a SIMD8 register
level and we need to know if a register is fully defined or written.
This is useful, for example, in the context of liveness analysis or
register allocation, where we work with units of registers.
The other scenario is that in which we want to know if an instruction
is writing a full scalar component or just some subset of it. This is
useful, for example, in the context of some optimization passes
like copy propagation.
For 32-bit instructions (or larger), a SIMD8 dispatch will always write
at least a full SIMD8 register (32B) if the write is not partial. The
function is_partial_write() checks this to determine if we have a partial
write. However, when we deal with 16-bit instructions, that logic disables
some optimizations that should be safe. For example, a SIMD8 16-bit MOV will
only update half of a SIMD register, but it is still a complete write of the
variable for a SIMD8 dispatch, so we should not prevent copy propagation in
this scenario because we don't write all 32 bytes in the SIMD register
or because the write starts at offset 16B (where we pack components Y or
W of 16-bit vectors).
This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit
instructions, which lose a number of optimizations because of this, most
important of which is copy-propagation.
This patch splits is_partial_write() into is_partial_reg_write(), which
represents the current is_partial_write(), useful for things like
liveness analysis, and is_partial_var_write(), which considers
the dispatch size to check if we are writing a full variable (rather
than a full register) to decide if the write is partial or not, which
is what we really want in many optimization passes.
Then the patch goes on and rewrites all uses of is_partial_write() to use
one or the other version. Specifically, we use is_partial_var_write()
in the following places: copy propagation, cmod propagation, common
subexpression elimination, saturate propagation and sel peephole.
Notice that the semantics of is_partial_var_write() exactly match the
current implementation of is_partial_write() for anything that is
32-bit or larger, so no changes are expected for 32-bit instructions.
Tested against ~5000 tests involving 16-bit instructions in CTS produced
the following changes in instruction counts:
       | Patched | Master  |       % |
================================================
SIMD8  | 621,900 | 706,721 | -12.00% |
================================================
SIMD16 |  93,252 |  93,252 |   0.00% |
================================================
As expected, the change only affects SIMD8 dispatches.
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
2018-07-10 09:52:46 +02:00
|
|
|
if (inst->is_partial_reg_write() ||
|
2016-05-16 01:23:44 -07:00
|
|
|
(!inst->force_writemask_all && !per_channel))
|
2016-05-15 22:59:04 -07:00
|
|
|
emit_unspill(ubld, spill_src, subset_spill_offset,
|
2016-09-07 16:59:35 -07:00
|
|
|
regs_written(inst));
|
2010-10-19 09:25:51 -07:00
|
|
|
|
2016-05-15 22:59:04 -07:00
|
|
|
emit_spill(ubld.at(block, inst->next), spill_src,
|
2016-09-07 16:59:35 -07:00
|
|
|
subset_spill_offset, regs_written(inst));
|
2010-10-19 09:25:51 -07:00
|
|
|
}
|
|
|
|
|
}
|
2011-01-12 10:10:01 -08:00
|
|
|
|
2014-09-01 10:54:00 -07:00
|
|
|
invalidate_live_intervals();
|
2010-10-20 10:26:29 -07:00
|
|
|
}
|