diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0991b733921..62a6921b9cc 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
    case FIXED_GRF:
    case VGRF:
    case ATTR:
-      return components_read(arg) * src[arg].component_size(exec_size);
+      /* Regardless of exec_size, values marked as scalar are SIMD8. */
+      return components_read(arg) *
+         src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
    }
    return 0;
 }
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index e6b4c735071..9d5f6acd4ef 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -28,6 +28,9 @@
 #include "brw_eu.h"
 #include "brw_fs.h"
 
+static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
+                             unsigned);
+
 namespace brw {
    /**
     * Toolbox to assemble an FS IR program out of individual instructions.
@@ -402,8 +405,9 @@
       move_to_vgrf(const brw_reg &src, unsigned num_components) const
       {
          brw_reg *const src_comps = new brw_reg[num_components];
+
          for (unsigned i = 0; i < num_components; i++)
-            src_comps[i] = offset(src, dispatch_width(), i);
+            src_comps[i] = offset(src, *this, i);
 
          const brw_reg dst = vgrf(src.type, num_components);
          LOAD_PAYLOAD(dst, src_comps, num_components, 0);
@@ -891,8 +895,36 @@
    };
 }
 
+/**
+ * Offset by a number of components into a VGRF
+ *
+ * It is assumed that the VGRF represents a vector (e.g., returned by
+ * load_uniform or a texture operation). Convergent and divergent values are
+ * stored differently, so care must be taken to offset properly.
+ */
 static inline brw_reg
 offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
 {
+   /* If the value is convergent (stored as one or more SIMD8), offset using
+    * SIMD8 and select component 0.
+    */
+   if (reg.is_scalar) {
+      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);
+
+      brw_reg offset_reg = offset(reg, allocation_width, delta);
+
+      /* If the dispatch width is larger than the allocation width, that
+       * implies that the register can only be used as a source. Otherwise the
+       * instruction would write past the allocation size of the register.
+       */
+      if (bld.dispatch_width() > allocation_width)
+         return component(offset_reg, 0);
+      else
+         return offset_reg;
+   }
+
+   /* Offset to the component assuming the value was allocated in
+    * dispatch_width units.
+    */
    return offset(reg, bld.dispatch_width(), delta);
 }
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index d23cffcd6bd..ed29dba41ff 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size)
    case SHADER_OPCODE_LOAD_PAYLOAD: {
       unsigned offset = 0;
       for (int i = def->header_size; i < def->sources; i++) {
-         const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1;
+         /* Ignore the source splat if the source is a scalar. In that case
+          * always use just the first component.
+          */
+         const unsigned splat =
+            (def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1;
+         const unsigned component_size =
+            def->src[i].component_size(def->exec_size);
+
          if (offset == src.offset) {
             if (def->dst.type == def->src[i].type &&
                 def->src[i].stride <= 1 &&
-                def->src[i].component_size(def->exec_size) * splat == src_size)
+                (component_size * splat == src_size ||
+                 (def->src[i].file == IMM && component_size == src_size)))
                val = def->src[i];
 
             break;
diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp
index 9bc8adef646..11648ce1a63 100644
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst
       new_reg = byte_offset(new_reg, reg->offset);
       new_reg.abs = reg->abs;
       new_reg.negate = reg->negate;
+      new_reg.is_scalar = reg->is_scalar;
 
       *reg = new_reg;
    }
diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp
index ac7ad3ed797..c385438b9bb 100644
--- a/src/intel/compiler/brw_fs_lower_simd_width.cpp
+++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp
@@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
    if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
       return false;
 
-   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
-            (inst->components_read(i) == 1 &&
-             lbld.dispatch_width() <= inst->exec_size)) ||
-          (inst->flags_written(lbld.shader->devinfo) &
-           brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)));
+   return !inst->src[i].is_scalar &&
+          (!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
+             (inst->components_read(i) == 1 &&
+              lbld.dispatch_width() <= inst->exec_size)) ||
+           (inst->flags_written(lbld.shader->devinfo) &
+            brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))));
 }
 
 /**
@@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
       return tmp;
 
    } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
-              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) {
+              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
+              inst->src[i].is_scalar) {
       /* The source is invariant for all dispatch_width-wide groups of the
        * original region.
        *
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
index 113484580b7..698c9330c4f 100644
--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s)
       if (inst->dst.file == VGRF) {
          fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
                         s.alloc.sizes[inst->dst.nr]);
+
+         if (inst->exec_size > 1)
+            fsv_assert_ne(inst->dst.stride, 0);
       }
 
       for (unsigned i = 0; i < inst->sources; i++) {
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index 448ad2556d2..9b14067a87d 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -182,7 +182,19 @@ typedef struct brw_reg {
          unsigned vstride:4;      /* source only */
          unsigned width:3;        /* src only, align1 only */
          unsigned hstride:2;      /* align1 only */
-         unsigned pad1:1;
+
+         /**
+          * Does this register represent a scalar value?
+          *
+          * Registers are allocated in SIMD8 parcels, but may be used to
+          * represent convergent (i.e., scalar) values. As a destination, it
+          * is written as SIMD8. As a source, it may be read as <8,8,1> in
+          * SIMD8 instructions or <0,1,0> on other execution sizes.
+          *
+          * If the value represents a vector (e.g., a convergent load_uniform
+          * of a vec4), it will be stored as multiple SIMD8 registers.
+          */
+         unsigned is_scalar:1;
       };
 
       double df;
@@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file,
    reg.vstride = vstride;
    reg.width = width;
    reg.hstride = hstride;
-   reg.pad1 = 0;
+   reg.is_scalar = 0;
    reg.offset = 0;
    reg.stride = 1;
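
For context, a minimal standalone sketch of the layout rule that the new
offset(reg, bld, delta) overload and the is_scalar bit implement. It assumes
reg_unit() == 1, so the allocation width is SIMD8; toy_reg, offset_by_width,
and offset_component are hypothetical stand-ins for brw_reg and the offset()
overloads, not the real API:

/* Hypothetical stand-in for brw_reg; only the fields the sketch needs. */
struct toy_reg {
   unsigned byte_offset;   /* byte offset of the current component in the VGRF */
   unsigned type_size;     /* bytes per channel, e.g. 4 for a 32-bit type      */
   bool is_scalar;         /* convergent value, allocated in SIMD8 parcels     */
   bool splat_lane0;       /* models component(reg, 0), i.e. a <0,1,0> region  */
};

/* Step 'delta' whole components, each 'width' channels wide. */
static toy_reg
offset_by_width(toy_reg r, unsigned width, unsigned delta)
{
   r.byte_offset += delta * width * r.type_size;
   return r;
}

/* Mirrors the offset(reg, bld, delta) overload added in brw_fs_builder.h. */
static toy_reg
offset_component(toy_reg r, unsigned dispatch_width, unsigned delta)
{
   const unsigned allocation_width = 8;   /* 8 * reg_unit(), reg_unit() == 1 */

   if (r.is_scalar) {
      /* Convergent vectors are packed as consecutive SIMD8 components. */
      toy_reg o = offset_by_width(r, allocation_width, delta);

      /* A wider dispatch can only read such a register (writing would run
       * past its allocation), so broadcast lane 0 to every channel.
       */
      if (dispatch_width > allocation_width)
         o.splat_lane0 = true;

      return o;
   }

   /* Divergent values are packed in dispatch_width units. */
   return offset_by_width(r, dispatch_width, delta);
}

Under these assumptions, component 1 of a convergent 32-bit vec2 lands at byte
offset 8 * 4 = 32 whether the consumer is SIMD8 or SIMD16; only the region
used to read it (<8,8,1> vs. <0,1,0>) changes.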