From 7229bffcb133b68f91607fb6bccbe0e48b6a55bd Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 16 May 2023 11:19:49 -0400 Subject: [PATCH] nir: Add intrinsics for register access Note the writemask handling is chosen for consistency with the rest of NIR. In every other instance, writemask=w requires a vec4 source. This is hardcoded into nir_validate and nir_print as what it means to have a writemask. More importantly, consistency with how register writemasks currently work. nir_print hides it, but r0.w = fneg ssa_1.x is actually a vec4 instruction with source ssa_1.xxxx. As a silly example nir_dest_num_components(that) = 4 in the old model. I realize this is quite strange coming from a scalar ISA, but it's perfectly natural for the class of vec4 hardware for which this was designed. In that hardware, conceptually all instructions are vec4`, so the sequence "fneg ssa_1 and write to channel w" is implemented as "fneg a vec4 with ssa_1.x in the last component and write that vec4 out but mask to write only the w channel". Isn't this inefficient? It can be. To save power, Midgard has scalar ALUs in addition to vec4 ALUs. Those details are confined to the backend VLIW scheduler; the instruction selection is still done as vec4. This mechanism has little in common with AMD's SALUs. Midgard has a wave size of 1, with special hacks for derivatives. As a result, all backends consuming register writemasks are expecting this pattern of code. Changing the store to take a vec1 instead of a vec4 would require changing every backend to reswizzle the sources to resurrect the vec4. I started typing a branch to do this yesterday, but it made a mess of both Midgard and nir-to-tgsi. Without any good reason to think it'd actually help performance, I abandoned the idea. Getting all 15 backends converted to the helpers is enough of a challenge without forcing 10 backends to reswizzle their sources too. Signed-off-by: Alyssa Rosenzweig Reviewed-by: Faith Ekstrand Part-of: --- src/compiler/nir/nir.h | 10 +++++ src/compiler/nir/nir_builder.h | 42 +++++++++++++++++++++ src/compiler/nir/nir_builder_opcodes_h.py | 3 ++ src/compiler/nir/nir_intrinsics.py | 45 +++++++++++++++++++++++ src/compiler/nir/nir_validate.c | 41 +++++++++++++++++++++ 5 files changed, 141 insertions(+) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index aa86d2f5618..b99f5ad7ddf 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -6249,6 +6249,16 @@ bool nir_mod_analysis(nir_ssa_scalar val, nir_alu_type val_type, unsigned div, u bool nir_remove_tex_shadow(nir_shader *shader, unsigned textures_bitmask); +static inline nir_intrinsic_instr * +nir_reg_get_decl(nir_ssa_def *reg) +{ + assert(reg->parent_instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *decl = nir_instr_as_intrinsic(reg->parent_instr); + assert(decl->intrinsic == nir_intrinsic_decl_reg); + + return decl; +} + #include "nir_inline_helpers.h" #ifdef __cplusplus diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index 6baf8f9791b..5e17ed743f4 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -1687,6 +1687,48 @@ nir_load_param(nir_builder *build, uint32_t param_idx) return nir_build_load_param(build, param->num_components, param->bit_size, param_idx); } +#undef nir_decl_reg +static inline nir_ssa_def * +nir_decl_reg(nir_builder *b, unsigned num_components, unsigned bit_size, + unsigned num_array_elems) +{ + nir_intrinsic_instr *decl = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_decl_reg); + nir_intrinsic_set_num_components(decl, num_components); + nir_intrinsic_set_bit_size(decl, bit_size); + nir_intrinsic_set_num_array_elems(decl, num_array_elems); + nir_ssa_dest_init(&decl->instr, &decl->dest, 1, 32); + + nir_instr_insert(nir_before_cf_list(&b->impl->body), &decl->instr); + + return &decl->dest.ssa; +} + +#undef nir_load_reg +static inline nir_ssa_def * +nir_load_reg(nir_builder *b, nir_ssa_def *reg) +{ + nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + unsigned num_components = nir_intrinsic_num_components(decl); + unsigned bit_size = nir_intrinsic_bit_size(decl); + + return nir_build_load_reg(b, num_components, bit_size, reg); +} + +#undef nir_store_reg +static inline void +nir_store_reg(nir_builder *b, nir_ssa_def *value, nir_ssa_def *reg) +{ + ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg); + ASSERTED unsigned num_components = nir_intrinsic_num_components(decl); + ASSERTED unsigned bit_size = nir_intrinsic_bit_size(decl); + + assert(value->num_components == num_components); + assert(value->bit_size == bit_size); + + nir_build_store_reg(b, value, reg); +} + static inline nir_tex_src nir_tex_src_for_ssa(nir_tex_src_type src_type, nir_ssa_def *def) { diff --git a/src/compiler/nir/nir_builder_opcodes_h.py b/src/compiler/nir/nir_builder_opcodes_h.py index 104336d0d0d..8aba90510aa 100644 --- a/src/compiler/nir/nir_builder_opcodes_h.py +++ b/src/compiler/nir/nir_builder_opcodes_h.py @@ -226,6 +226,9 @@ build_prefixed_intrinsics = [ "load_global_constant", "store_global", + "load_reg", + "store_reg", + "deref_mode_is", ] diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 70dc50370e2..ffa02de1fba 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -294,6 +294,24 @@ index("unsigned", "resource_block_intel") # Various flags describing the resource access index("nir_resource_data_intel", "resource_access_intel") +# Register metadata +# number of vector components +index("unsigned", "num_components") +# size of array (0 for no array) +index("unsigned", "num_array_elems") +# The bit-size of each channel; must be one of 1, 8, 16, 32, or 64 +index("unsigned", "bit_size") +# True if this register may have different values in different SIMD invocations +# of the shader. +index("bool", "divergent") + +# On a register load, floating-point absolute value/negate loaded value. +index("bool", "legacy_fabs") +index("bool", "legacy_fneg") + +# On a register store, floating-point saturate the stored value. +index("bool", "legacy_fsat") + intrinsic("nop", flags=[CAN_ELIMINATE]) intrinsic("convert_alu_types", dest_comp=0, src_comp=[0], @@ -308,6 +326,33 @@ intrinsic("store_deref", src_comp=[-1, 0], indices=[WRITE_MASK, ACCESS]) intrinsic("copy_deref", src_comp=[-1, -1], indices=[DST_ACCESS, SRC_ACCESS]) intrinsic("memcpy_deref", src_comp=[-1, -1, 1], indices=[DST_ACCESS, SRC_ACCESS]) +# Returns an opaque handle representing a register indexed by BASE. The +# logically def-use list of a register is given by the use list of this handle. +# The shape of the underlying register is given by the indices, the handle +# itself is always a 32-bit scalar. +intrinsic("decl_reg", dest_comp=1, + indices=[NUM_COMPONENTS, NUM_ARRAY_ELEMS, BIT_SIZE, DIVERGENT], + flags=[CAN_ELIMINATE]) + +# Load a register given as the source directly with base offset BASE. +intrinsic("load_reg", dest_comp=0, src_comp=[1], + indices=[BASE, LEGACY_FABS, LEGACY_FNEG], flags=[CAN_ELIMINATE]) + +# Load a register given as first source indirectly with base offset BASE and +# indirect offset as second source. +intrinsic("load_reg_indirect", dest_comp=0, src_comp=[1, 1], + indices=[BASE, LEGACY_FABS, LEGACY_FNEG], flags=[CAN_ELIMINATE]) + +# Store the value in the first source to a register given as the second source +# directly with base offset BASE. +intrinsic("store_reg", src_comp=[0, 1], + indices=[BASE, WRITE_MASK, LEGACY_FSAT]) + +# Store the value in the first source to a register given as the second +# source indirectly with base offset BASE and indirect offset as third source. +intrinsic("store_reg_indirect", src_comp=[0, 1, 1], + indices=[BASE, WRITE_MASK, LEGACY_FSAT]) + # Interpolation of input. The interp_deref_at* intrinsics are similar to the # load_var intrinsic acting on a shader input except that they interpolate the # input differently. The at_sample, at_offset and at_vertex intrinsics take an diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 8881aa1461a..6f2eab2db1f 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -585,12 +585,53 @@ image_intrin_format(nir_intrinsic_instr *instr) return var->data.image.format; } +static void +validate_register_handle(nir_src handle_src, + unsigned num_components, + unsigned bit_size, + validate_state *state) +{ + if (!validate_assert(state, handle_src.is_ssa)) + return; + + nir_ssa_def *handle = handle_src.ssa; + nir_instr *parent = handle->parent_instr; + + if (!validate_assert(state, parent->type == nir_instr_type_intrinsic)) + return; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); + if (!validate_assert(state, intr->intrinsic == nir_intrinsic_decl_reg)) + return; + + validate_assert(state, nir_intrinsic_num_components(intr) == num_components); + validate_assert(state, nir_intrinsic_bit_size(intr) == bit_size); +} + static void validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) { unsigned dest_bit_size = 0; unsigned src_bit_sizes[NIR_INTRINSIC_MAX_INPUTS] = { 0, }; switch (instr->intrinsic) { + case nir_intrinsic_decl_reg: + assert(state->block == nir_start_block(state->impl)); + break; + + case nir_intrinsic_load_reg: + case nir_intrinsic_load_reg_indirect: + validate_register_handle(instr->src[0], + nir_dest_num_components(instr->dest), + nir_dest_bit_size(instr->dest), state); + break; + + case nir_intrinsic_store_reg: + case nir_intrinsic_store_reg_indirect: + validate_register_handle(instr->src[1], + nir_src_num_components(instr->src[0]), + nir_src_bit_size(instr->src[0]), state); + break; + case nir_intrinsic_convert_alu_types: { nir_alu_type src_type = nir_intrinsic_src_type(instr); nir_alu_type dest_type = nir_intrinsic_dest_type(instr);