brw: Basic infrastructure to store convergent values as scalars

In SIMD16 and SIMD32, storing convergent values in full 16- or
32-channel registers is wasteful. It wastes register space, and in most
cases on SIMD32, it wastes instructions. Our register allocator is not
clever enough to handle scalar allocations. It's fundamental unit of
allocation is SIMD8. Start treating convergent values as SIMD8.

Add a tracking bit in brw_reg to specify that a register represents a
convergent, scalar value. This has two implications:

1. All channels of the SIMD8 register must contain the same value. In
   general, this means that writes to the register must be
   force_writemask_all and exec_size = 8;

2. Reads of this register can (and should) use <0,1,0> stride. SIMD8
   instructions that have restrictions on source stride can us <8,8,1>.

Values that are vectors (e.g., results of load_uniform or texture
operations) will be stored as multiple SIMD8 hardware registers.

v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2.

v3: Eliminte offset_to_scalar(). Remove mention of vec4 backend in
brw_reg.h. Both suggested by Caio. The offset_to_scalar() change
necessitates some trickery in the fs_builder offset() function, but I
think this is an improvement overall. There is also some rework in
find_value_for_offset to account for the possibility that is_scalar
sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
This commit is contained in:
Ian Romanick 2024-02-09 17:12:11 -08:00
parent ef3dc401da
commit 1bff4f93ca
7 changed files with 72 additions and 12 deletions

View file

@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
case FIXED_GRF: case FIXED_GRF:
case VGRF: case VGRF:
case ATTR: case ATTR:
return components_read(arg) * src[arg].component_size(exec_size); /* Regardless of exec_size, values marked as scalar are SIMD8. */
return components_read(arg) *
src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
} }
return 0; return 0;
} }

View file

@ -28,6 +28,9 @@
#include "brw_eu.h" #include "brw_eu.h"
#include "brw_fs.h" #include "brw_fs.h"
static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
unsigned);
namespace brw { namespace brw {
/** /**
* Toolbox to assemble an FS IR program out of individual instructions. * Toolbox to assemble an FS IR program out of individual instructions.
@ -402,8 +405,9 @@ namespace brw {
move_to_vgrf(const brw_reg &src, unsigned num_components) const move_to_vgrf(const brw_reg &src, unsigned num_components) const
{ {
brw_reg *const src_comps = new brw_reg[num_components]; brw_reg *const src_comps = new brw_reg[num_components];
for (unsigned i = 0; i < num_components; i++) for (unsigned i = 0; i < num_components; i++)
src_comps[i] = offset(src, dispatch_width(), i); src_comps[i] = offset(src, *this, i);
const brw_reg dst = vgrf(src.type, num_components); const brw_reg dst = vgrf(src.type, num_components);
LOAD_PAYLOAD(dst, src_comps, num_components, 0); LOAD_PAYLOAD(dst, src_comps, num_components, 0);
@ -891,8 +895,36 @@ namespace brw {
}; };
} }
/**
* Offset by a number of components into a VGRF
*
* It is assumed that the VGRF represents a vector (e.g., returned by
* load_uniform or a texture operation). Convergent and divergent values are
* stored differently, so care must be taken to offset properly.
*/
static inline brw_reg static inline brw_reg
offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta) offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
{ {
/* If the value is convergent (stored as one or more SIMD8), offset using
* SIMD8 and select component 0.
*/
if (reg.is_scalar) {
const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);
brw_reg offset_reg = offset(reg, allocation_width, delta);
/* If the dispatch width is larger than the allocation width, that
* implies that the register can only be used as a source. Otherwise the
* instruction would write past the allocation size of the register.
*/
if (bld.dispatch_width() > allocation_width)
return component(offset_reg, 0);
else
return offset_reg;
}
/* Offset to the component assuming the value was allocated in
* dispatch_width units.
*/
return offset(reg, bld.dispatch_width(), delta); return offset(reg, bld.dispatch_width(), delta);
} }

View file

@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size)
case SHADER_OPCODE_LOAD_PAYLOAD: { case SHADER_OPCODE_LOAD_PAYLOAD: {
unsigned offset = 0; unsigned offset = 0;
for (int i = def->header_size; i < def->sources; i++) { for (int i = def->header_size; i < def->sources; i++) {
const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1; /* Ignore the source splat if the source is a scalar. In that case
* always use just the first component.
*/
const unsigned splat =
(def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1;
const unsigned component_size =
def->src[i].component_size(def->exec_size);
if (offset == src.offset) { if (offset == src.offset) {
if (def->dst.type == def->src[i].type && if (def->dst.type == def->src[i].type &&
def->src[i].stride <= 1 && def->src[i].stride <= 1 &&
def->src[i].component_size(def->exec_size) * splat == src_size) (component_size * splat == src_size ||
(def->src[i].file == IMM && component_size == src_size)))
val = def->src[i]; val = def->src[i];
break; break;

View file

@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst
new_reg = byte_offset(new_reg, reg->offset); new_reg = byte_offset(new_reg, reg->offset);
new_reg.abs = reg->abs; new_reg.abs = reg->abs;
new_reg.negate = reg->negate; new_reg.negate = reg->negate;
new_reg.is_scalar = reg->is_scalar;
*reg = new_reg; *reg = new_reg;
} }

View file

@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
return false; return false;
return !(is_periodic(inst->src[i], lbld.dispatch_width()) || return !inst->src[i].is_scalar &&
(inst->components_read(i) == 1 && (!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
lbld.dispatch_width() <= inst->exec_size)) || (inst->components_read(i) == 1 &&
(inst->flags_written(lbld.shader->devinfo) & lbld.dispatch_width() <= inst->exec_size)) ||
brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))); (inst->flags_written(lbld.shader->devinfo) &
brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))));
} }
/** /**
@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
return tmp; return tmp;
} else if (is_periodic(inst->src[i], lbld.dispatch_width()) || } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
(inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) { (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
inst->src[i].is_scalar) {
/* The source is invariant for all dispatch_width-wide groups of the /* The source is invariant for all dispatch_width-wide groups of the
* original region. * original region.
* *

View file

@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s)
if (inst->dst.file == VGRF) { if (inst->dst.file == VGRF) {
fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst), fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
s.alloc.sizes[inst->dst.nr]); s.alloc.sizes[inst->dst.nr]);
if (inst->exec_size > 1)
fsv_assert_ne(inst->dst.stride, 0);
} }
for (unsigned i = 0; i < inst->sources; i++) { for (unsigned i = 0; i < inst->sources; i++) {

View file

@ -182,7 +182,19 @@ typedef struct brw_reg {
unsigned vstride:4; /* source only */ unsigned vstride:4; /* source only */
unsigned width:3; /* src only, align1 only */ unsigned width:3; /* src only, align1 only */
unsigned hstride:2; /* align1 only */ unsigned hstride:2; /* align1 only */
unsigned pad1:1;
/**
* Does this register represent a scalar value?
*
* Registers are allocated in SIMD8 parcels, but may be used to
* represent convergent (i.e., scalar) values. As a destination, it
* is written as SIMD8. As a source, it may be read as <8,8,1> in
* SIMD8 instructions or <0,1,0> on other execution sizes.
*
* If the value represents a vector (e.g., a convergent load_uniform
* of a vec4), it will be stored as multiple SIMD8 registers.
*/
unsigned is_scalar:1;
}; };
double df; double df;
@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file,
reg.vstride = vstride; reg.vstride = vstride;
reg.width = width; reg.width = width;
reg.hstride = hstride; reg.hstride = hstride;
reg.pad1 = 0; reg.is_scalar = 0;
reg.offset = 0; reg.offset = 0;
reg.stride = 1; reg.stride = 1;