nir: Add intrinsics for register access

Note the writemask handling is chosen for consistency with the rest of NIR. In
every other instance, writemask=w requires a vec4 source. This is hardcoded into
nir_validate and nir_print as what it means to have a writemask.

More importantly, consistency with how register writemasks currently work.
nir_print hides it, but r0.w = fneg ssa_1.x is actually a vec4 instruction with
source ssa_1.xxxx. As a silly example nir_dest_num_components(that) = 4 in the
old model. I realize this is quite strange coming from a scalar ISA, but it's
perfectly natural for the class of vec4 hardware for which this was designed. In
that hardware, conceptually all instructions are vec4`, so the sequence "fneg
ssa_1 and write to channel w" is implemented as "fneg a vec4 with ssa_1.x in the
last component and write that vec4 out but mask to write only the w channel".

Isn't this inefficient? It can be. To save power, Midgard has scalar ALUs in
addition to vec4 ALUs. Those details are confined to the backend VLIW scheduler;
the instruction selection is still done as vec4. This mechanism has little in
common with AMD's SALUs. Midgard has a wave size of 1, with special hacks for
derivatives.

As a result, all backends consuming register writemasks are expecting this
pattern of code. Changing the store to take a vec1 instead of a vec4 would
require changing every backend to reswizzle the sources to resurrect the vec4. I
started typing a branch to do this yesterday, but it made a mess of both Midgard
and nir-to-tgsi. Without any good reason to think it'd actually help
performance, I abandoned the idea. Getting all 15 backends converted to the
helpers is enough of a challenge without forcing 10 backends to reswizzle their
sources too.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23089>
This commit is contained in:
Alyssa Rosenzweig 2023-05-16 11:19:49 -04:00 committed by Marge Bot
parent 80c10b2a4b
commit 7229bffcb1
5 changed files with 141 additions and 0 deletions

View file

@ -6249,6 +6249,16 @@ bool nir_mod_analysis(nir_ssa_scalar val, nir_alu_type val_type, unsigned div, u
bool bool
nir_remove_tex_shadow(nir_shader *shader, unsigned textures_bitmask); nir_remove_tex_shadow(nir_shader *shader, unsigned textures_bitmask);
static inline nir_intrinsic_instr *
nir_reg_get_decl(nir_ssa_def *reg)
{
assert(reg->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *decl = nir_instr_as_intrinsic(reg->parent_instr);
assert(decl->intrinsic == nir_intrinsic_decl_reg);
return decl;
}
#include "nir_inline_helpers.h" #include "nir_inline_helpers.h"
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -1687,6 +1687,48 @@ nir_load_param(nir_builder *build, uint32_t param_idx)
return nir_build_load_param(build, param->num_components, param->bit_size, param_idx); return nir_build_load_param(build, param->num_components, param->bit_size, param_idx);
} }
#undef nir_decl_reg
static inline nir_ssa_def *
nir_decl_reg(nir_builder *b, unsigned num_components, unsigned bit_size,
unsigned num_array_elems)
{
nir_intrinsic_instr *decl =
nir_intrinsic_instr_create(b->shader, nir_intrinsic_decl_reg);
nir_intrinsic_set_num_components(decl, num_components);
nir_intrinsic_set_bit_size(decl, bit_size);
nir_intrinsic_set_num_array_elems(decl, num_array_elems);
nir_ssa_dest_init(&decl->instr, &decl->dest, 1, 32);
nir_instr_insert(nir_before_cf_list(&b->impl->body), &decl->instr);
return &decl->dest.ssa;
}
#undef nir_load_reg
static inline nir_ssa_def *
nir_load_reg(nir_builder *b, nir_ssa_def *reg)
{
nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
unsigned num_components = nir_intrinsic_num_components(decl);
unsigned bit_size = nir_intrinsic_bit_size(decl);
return nir_build_load_reg(b, num_components, bit_size, reg);
}
#undef nir_store_reg
static inline void
nir_store_reg(nir_builder *b, nir_ssa_def *value, nir_ssa_def *reg)
{
ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
ASSERTED unsigned num_components = nir_intrinsic_num_components(decl);
ASSERTED unsigned bit_size = nir_intrinsic_bit_size(decl);
assert(value->num_components == num_components);
assert(value->bit_size == bit_size);
nir_build_store_reg(b, value, reg);
}
static inline nir_tex_src static inline nir_tex_src
nir_tex_src_for_ssa(nir_tex_src_type src_type, nir_ssa_def *def) nir_tex_src_for_ssa(nir_tex_src_type src_type, nir_ssa_def *def)
{ {

View file

@ -226,6 +226,9 @@ build_prefixed_intrinsics = [
"load_global_constant", "load_global_constant",
"store_global", "store_global",
"load_reg",
"store_reg",
"deref_mode_is", "deref_mode_is",
] ]

View file

@ -294,6 +294,24 @@ index("unsigned", "resource_block_intel")
# Various flags describing the resource access # Various flags describing the resource access
index("nir_resource_data_intel", "resource_access_intel") index("nir_resource_data_intel", "resource_access_intel")
# Register metadata
# number of vector components
index("unsigned", "num_components")
# size of array (0 for no array)
index("unsigned", "num_array_elems")
# The bit-size of each channel; must be one of 1, 8, 16, 32, or 64
index("unsigned", "bit_size")
# True if this register may have different values in different SIMD invocations
# of the shader.
index("bool", "divergent")
# On a register load, floating-point absolute value/negate loaded value.
index("bool", "legacy_fabs")
index("bool", "legacy_fneg")
# On a register store, floating-point saturate the stored value.
index("bool", "legacy_fsat")
intrinsic("nop", flags=[CAN_ELIMINATE]) intrinsic("nop", flags=[CAN_ELIMINATE])
intrinsic("convert_alu_types", dest_comp=0, src_comp=[0], intrinsic("convert_alu_types", dest_comp=0, src_comp=[0],
@ -308,6 +326,33 @@ intrinsic("store_deref", src_comp=[-1, 0], indices=[WRITE_MASK, ACCESS])
intrinsic("copy_deref", src_comp=[-1, -1], indices=[DST_ACCESS, SRC_ACCESS]) intrinsic("copy_deref", src_comp=[-1, -1], indices=[DST_ACCESS, SRC_ACCESS])
intrinsic("memcpy_deref", src_comp=[-1, -1, 1], indices=[DST_ACCESS, SRC_ACCESS]) intrinsic("memcpy_deref", src_comp=[-1, -1, 1], indices=[DST_ACCESS, SRC_ACCESS])
# Returns an opaque handle representing a register indexed by BASE. The
# logically def-use list of a register is given by the use list of this handle.
# The shape of the underlying register is given by the indices, the handle
# itself is always a 32-bit scalar.
intrinsic("decl_reg", dest_comp=1,
indices=[NUM_COMPONENTS, NUM_ARRAY_ELEMS, BIT_SIZE, DIVERGENT],
flags=[CAN_ELIMINATE])
# Load a register given as the source directly with base offset BASE.
intrinsic("load_reg", dest_comp=0, src_comp=[1],
indices=[BASE, LEGACY_FABS, LEGACY_FNEG], flags=[CAN_ELIMINATE])
# Load a register given as first source indirectly with base offset BASE and
# indirect offset as second source.
intrinsic("load_reg_indirect", dest_comp=0, src_comp=[1, 1],
indices=[BASE, LEGACY_FABS, LEGACY_FNEG], flags=[CAN_ELIMINATE])
# Store the value in the first source to a register given as the second source
# directly with base offset BASE.
intrinsic("store_reg", src_comp=[0, 1],
indices=[BASE, WRITE_MASK, LEGACY_FSAT])
# Store the value in the first source to a register given as the second
# source indirectly with base offset BASE and indirect offset as third source.
intrinsic("store_reg_indirect", src_comp=[0, 1, 1],
indices=[BASE, WRITE_MASK, LEGACY_FSAT])
# Interpolation of input. The interp_deref_at* intrinsics are similar to the # Interpolation of input. The interp_deref_at* intrinsics are similar to the
# load_var intrinsic acting on a shader input except that they interpolate the # load_var intrinsic acting on a shader input except that they interpolate the
# input differently. The at_sample, at_offset and at_vertex intrinsics take an # input differently. The at_sample, at_offset and at_vertex intrinsics take an

View file

@ -585,12 +585,53 @@ image_intrin_format(nir_intrinsic_instr *instr)
return var->data.image.format; return var->data.image.format;
} }
static void
validate_register_handle(nir_src handle_src,
unsigned num_components,
unsigned bit_size,
validate_state *state)
{
if (!validate_assert(state, handle_src.is_ssa))
return;
nir_ssa_def *handle = handle_src.ssa;
nir_instr *parent = handle->parent_instr;
if (!validate_assert(state, parent->type == nir_instr_type_intrinsic))
return;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
if (!validate_assert(state, intr->intrinsic == nir_intrinsic_decl_reg))
return;
validate_assert(state, nir_intrinsic_num_components(intr) == num_components);
validate_assert(state, nir_intrinsic_bit_size(intr) == bit_size);
}
static void static void
validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
{ {
unsigned dest_bit_size = 0; unsigned dest_bit_size = 0;
unsigned src_bit_sizes[NIR_INTRINSIC_MAX_INPUTS] = { 0, }; unsigned src_bit_sizes[NIR_INTRINSIC_MAX_INPUTS] = { 0, };
switch (instr->intrinsic) { switch (instr->intrinsic) {
case nir_intrinsic_decl_reg:
assert(state->block == nir_start_block(state->impl));
break;
case nir_intrinsic_load_reg:
case nir_intrinsic_load_reg_indirect:
validate_register_handle(instr->src[0],
nir_dest_num_components(instr->dest),
nir_dest_bit_size(instr->dest), state);
break;
case nir_intrinsic_store_reg:
case nir_intrinsic_store_reg_indirect:
validate_register_handle(instr->src[1],
nir_src_num_components(instr->src[0]),
nir_src_bit_size(instr->src[0]), state);
break;
case nir_intrinsic_convert_alu_types: { case nir_intrinsic_convert_alu_types: {
nir_alu_type src_type = nir_intrinsic_src_type(instr); nir_alu_type src_type = nir_intrinsic_src_type(instr);
nir_alu_type dest_type = nir_intrinsic_dest_type(instr); nir_alu_type dest_type = nir_intrinsic_dest_type(instr);