From 1bff4f93ca98847a637bbf836e290b70fe4db590 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 9 Feb 2024 17:12:11 -0800 Subject: [PATCH] brw: Basic infrastructure to store convergent values as scalars In SIMD16 and SIMD32, storing convergent values in full 16- or 32-channel registers is wasteful. It wastes register space, and in most cases on SIMD32, it wastes instructions. Our register allocator is not clever enough to handle scalar allocations. Its fundamental unit of allocation is SIMD8. Start treating convergent values as SIMD8. Add a tracking bit in brw_reg to specify that a register represents a convergent, scalar value. This has two implications: 1. All channels of the SIMD8 register must contain the same value. In general, this means that writes to the register must be force_writemask_all and exec_size = 8; 2. Reads of this register can (and should) use <0,1,0> stride. SIMD8 instructions that have restrictions on source stride can use <8,8,1>. Values that are vectors (e.g., results of load_uniform or texture operations) will be stored as multiple SIMD8 hardware registers. v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2. v3: Eliminate offset_to_scalar(). Remove mention of vec4 backend in brw_reg.h. Both suggested by Caio. The offset_to_scalar() change necessitates some trickery in the fs_builder offset() function, but I think this is an improvement overall. There is also some rework in find_value_for_offset to account for the possibility that is_scalar sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>. 
Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs.cpp | 4 ++- src/intel/compiler/brw_fs_builder.h | 34 ++++++++++++++++++- .../compiler/brw_fs_copy_propagation.cpp | 12 +++++-- src/intel/compiler/brw_fs_lower.cpp | 1 + .../compiler/brw_fs_lower_simd_width.cpp | 14 ++++---- src/intel/compiler/brw_fs_validate.cpp | 3 ++ src/intel/compiler/brw_reg.h | 16 +++++++-- 7 files changed, 72 insertions(+), 12 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 0991b733921..62a6921b9cc 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const case FIXED_GRF: case VGRF: case ATTR: - return components_read(arg) * src[arg].component_size(exec_size); + /* Regardless of exec_size, values marked as scalar are SIMD8. */ + return components_read(arg) * + src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size); } return 0; } diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index e6b4c735071..9d5f6acd4ef 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -28,6 +28,9 @@ #include "brw_eu.h" #include "brw_fs.h" +static inline brw_reg offset(const brw_reg &, const brw::fs_builder &, + unsigned); + namespace brw { /** * Toolbox to assemble an FS IR program out of individual instructions. 
@@ -402,8 +405,9 @@ namespace brw { move_to_vgrf(const brw_reg &src, unsigned num_components) const { brw_reg *const src_comps = new brw_reg[num_components]; + for (unsigned i = 0; i < num_components; i++) - src_comps[i] = offset(src, dispatch_width(), i); + src_comps[i] = offset(src, *this, i); const brw_reg dst = vgrf(src.type, num_components); LOAD_PAYLOAD(dst, src_comps, num_components, 0); @@ -891,8 +895,36 @@ namespace brw { }; } +/** + * Offset by a number of components into a VGRF + * + * It is assumed that the VGRF represents a vector (e.g., returned by + * load_uniform or a texture operation). Convergent and divergent values are + * stored differently, so care must be taken to offset properly. + */ static inline brw_reg offset(const brw_reg ®, const brw::fs_builder &bld, unsigned delta) { + /* If the value is convergent (stored as one or more SIMD8), offset using + * SIMD8 and select component 0. + */ + if (reg.is_scalar) { + const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo); + + brw_reg offset_reg = offset(reg, allocation_width, delta); + + /* If the dispatch width is larger than the allocation width, that + * implies that the register can only be used as a source. Otherwise the + * instruction would write past the allocation size of the register. + */ + if (bld.dispatch_width() > allocation_width) + return component(offset_reg, 0); + else + return offset_reg; + } + + /* Offset to the component assuming the value was allocated in + * dispatch_width units. 
+ */ return offset(reg, bld.dispatch_width(), delta); } diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index d23cffcd6bd..ed29dba41ff 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size) case SHADER_OPCODE_LOAD_PAYLOAD: { unsigned offset = 0; for (int i = def->header_size; i < def->sources; i++) { - const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1; + /* Ignore the source splat if the source is a scalar. In that case + * always use just the first component. + */ + const unsigned splat = + (def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1; + const unsigned component_size = + def->src[i].component_size(def->exec_size); + if (offset == src.offset) { if (def->dst.type == def->src[i].type && def->src[i].stride <= 1 && - def->src[i].component_size(def->exec_size) * splat == src_size) + (component_size * splat == src_size || + (def->src[i].file == IMM && component_size == src_size))) val = def->src[i]; break; diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp index 9bc8adef646..11648ce1a63 100644 --- a/src/intel/compiler/brw_fs_lower.cpp +++ b/src/intel/compiler/brw_fs_lower.cpp @@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst new_reg = byte_offset(new_reg, reg->offset); new_reg.abs = reg->abs; new_reg.negate = reg->negate; + new_reg.is_scalar = reg->is_scalar; *reg = new_reg; } diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp index ac7ad3ed797..c385438b9bb 100644 --- a/src/intel/compiler/brw_fs_lower_simd_width.cpp +++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp @@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, 
unsigned i) if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) return false; - return !(is_periodic(inst->src[i], lbld.dispatch_width()) || - (inst->components_read(i) == 1 && - lbld.dispatch_width() <= inst->exec_size)) || - (inst->flags_written(lbld.shader->devinfo) & - brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))); + return !inst->src[i].is_scalar && + (!(is_periodic(inst->src[i], lbld.dispatch_width()) || + (inst->components_read(i) == 1 && + lbld.dispatch_width() <= inst->exec_size)) || + (inst->flags_written(lbld.shader->devinfo) & + brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)))); } /** @@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) return tmp; } else if (is_periodic(inst->src[i], lbld.dispatch_width()) || - (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) { + (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) || + inst->src[i].is_scalar) { /* The source is invariant for all dispatch_width-wide groups of the * original region. * diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp index 113484580b7..698c9330c4f 100644 --- a/src/intel/compiler/brw_fs_validate.cpp +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s) if (inst->dst.file == VGRF) { fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst), s.alloc.sizes[inst->dst.nr]); + + if (inst->exec_size > 1) + fsv_assert_ne(inst->dst.stride, 0); } for (unsigned i = 0; i < inst->sources; i++) { diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h index 448ad2556d2..9b14067a87d 100644 --- a/src/intel/compiler/brw_reg.h +++ b/src/intel/compiler/brw_reg.h @@ -182,7 +182,19 @@ typedef struct brw_reg { unsigned vstride:4; /* source only */ unsigned width:3; /* src only, align1 only */ unsigned hstride:2; /* align1 only */ - unsigned pad1:1; + + /** + * Does this register represent a scalar value? 
+ * + * Registers are allocated in SIMD8 parcels, but may be used to + * represent convergent (i.e., scalar) values. As a destination, it + * is written as SIMD8. As a source, it may be read as <8,8,1> in + * SIMD8 instructions or <0,1,0> on other execution sizes. + * + * If the value represents a vector (e.g., a convergent load_uniform + * of a vec4), it will be stored as multiple SIMD8 registers. + */ + unsigned is_scalar:1; }; double df; @@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file, reg.vstride = vstride; reg.width = width; reg.hstride = hstride; - reg.pad1 = 0; + reg.is_scalar = 0; reg.offset = 0; reg.stride = 1;