diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0991b733921..62a6921b9cc 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -729,7 +729,9 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
    case FIXED_GRF:
    case VGRF:
    case ATTR:
-      return components_read(arg) * src[arg].component_size(exec_size);
+      /* Regardless of exec_size, values marked as scalar are SIMD8. */
+      return components_read(arg) *
+         src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
    }
    return 0;
 }
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index e6b4c735071..9d5f6acd4ef 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -28,6 +28,9 @@
 #include "brw_eu.h"
 #include "brw_fs.h"
 
+static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
+                             unsigned);
+
 namespace brw {
    /**
     * Toolbox to assemble an FS IR program out of individual instructions.
@@ -402,8 +405,9 @@
       move_to_vgrf(const brw_reg &src, unsigned num_components) const
       {
          brw_reg *const src_comps = new brw_reg[num_components];
+
          for (unsigned i = 0; i < num_components; i++)
-            src_comps[i] = offset(src, dispatch_width(), i);
+            src_comps[i] = offset(src, *this, i);
 
          const brw_reg dst = vgrf(src.type, num_components);
          LOAD_PAYLOAD(dst, src_comps, num_components, 0);
@@ -891,8 +895,36 @@
    };
 }
 
+/**
+ * Offset by a number of components into a VGRF
+ *
+ * It is assumed that the VGRF represents a vector (e.g., returned by
+ * load_uniform or a texture operation). Convergent and divergent values are
+ * stored differently, so care must be taken to offset properly.
+ */
 static inline brw_reg
 offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
 {
+   /* If the value is convergent (stored as one or more SIMD8), offset using
+    * SIMD8 and select component 0.
+    */
+   if (reg.is_scalar) {
+      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);
+
+      brw_reg offset_reg = offset(reg, allocation_width, delta);
+
+      /* If the dispatch width is larger than the allocation width, that
+       * implies that the register can only be used as a source. Otherwise the
+       * instruction would write past the allocation size of the register.
+       */
+      if (bld.dispatch_width() > allocation_width)
+         return component(offset_reg, 0);
+      else
+         return offset_reg;
+   }
+
+   /* Offset to the component assuming the value was allocated in
+    * dispatch_width units.
+    */
    return offset(reg, bld.dispatch_width(), delta);
 }
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index d23cffcd6bd..ed29dba41ff 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -1778,11 +1778,19 @@ find_value_for_offset(fs_inst *def, const brw_reg &src, unsigned src_size)
    case SHADER_OPCODE_LOAD_PAYLOAD: {
       unsigned offset = 0;
       for (int i = def->header_size; i < def->sources; i++) {
-         const unsigned splat = def->src[i].stride == 0 ? def->exec_size : 1;
+         /* Ignore the source splat if the source is a scalar. In that case
+          * always use just the first component.
+          */
+         const unsigned splat =
+            (def->src[i].stride == 0 && !src.is_scalar) || def->src[i].file == IMM ? def->exec_size : 1;
+         const unsigned component_size =
+            def->src[i].component_size(def->exec_size);
+
          if (offset == src.offset) {
             if (def->dst.type == def->src[i].type &&
                 def->src[i].stride <= 1 &&
-                def->src[i].component_size(def->exec_size) * splat == src_size)
+                (component_size * splat == src_size ||
+                 (def->src[i].file == IMM && component_size == src_size)))
                val = def->src[i];
 
             break;
diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp
index 9bc8adef646..11648ce1a63 100644
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@@ -749,6 +749,7 @@ brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst
       new_reg = byte_offset(new_reg, reg->offset);
       new_reg.abs = reg->abs;
       new_reg.negate = reg->negate;
+      new_reg.is_scalar = reg->is_scalar;
 
       *reg = new_reg;
    }
diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp
index ac7ad3ed797..c385438b9bb 100644
--- a/src/intel/compiler/brw_fs_lower_simd_width.cpp
+++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp
@@ -478,11 +478,12 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
    if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
       return false;
 
-   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
-            (inst->components_read(i) == 1 &&
-             lbld.dispatch_width() <= inst->exec_size)) ||
-          (inst->flags_written(lbld.shader->devinfo) &
-           brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)));
+   return !inst->src[i].is_scalar &&
+          (!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
+             (inst->components_read(i) == 1 &&
+              lbld.dispatch_width() <= inst->exec_size)) ||
+           (inst->flags_written(lbld.shader->devinfo) &
+            brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))));
 }
 
 /**
@@ -509,7 +510,8 @@ emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
       return tmp;
 
    } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
-              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) {
+              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
+              inst->src[i].is_scalar) {
       /* The source is invariant for all dispatch_width-wide groups of the
        * original region.
        *
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
index 113484580b7..698c9330c4f 100644
--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -377,6 +377,9 @@ brw_fs_validate(const fs_visitor &s)
       if (inst->dst.file == VGRF) {
          fsv_assert_lte(inst->dst.offset / REG_SIZE + regs_written(inst),
                         s.alloc.sizes[inst->dst.nr]);
+
+         if (inst->exec_size > 1)
+            fsv_assert_ne(inst->dst.stride, 0);
       }
 
       for (unsigned i = 0; i < inst->sources; i++) {
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index 448ad2556d2..9b14067a87d 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -182,7 +182,19 @@ typedef struct brw_reg {
          unsigned vstride:4;      /* source only */
          unsigned width:3;        /* src only, align1 only */
          unsigned hstride:2;      /* align1 only */
-         unsigned pad1:1;
+
+         /**
+          * Does this register represent a scalar value?
+          *
+          * Registers are allocated in SIMD8 parcels, but may be used to
+          * represent convergent (i.e., scalar) values. As a destination, it
+          * is written as SIMD8. As a source, it may be read as <8,8,1> in
+          * SIMD8 instructions or <0,1,0> on other execution sizes.
+          *
+          * If the value represents a vector (e.g., a convergent load_uniform
+          * of a vec4), it will be stored as multiple SIMD8 registers.
+          */
+         unsigned is_scalar:1;
       };
 
       double df;
@@ -405,7 +417,7 @@ brw_make_reg(enum brw_reg_file file,
    reg.vstride = vstride;
    reg.width = width;
    reg.hstride = hstride;
-   reg.pad1 = 0;
+   reg.is_scalar = 0;
    reg.offset = 0;
    reg.stride = 1;
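
For context, a minimal standalone sketch of the layout rule that the new
offset(reg, bld, delta) overload and the is_scalar bit implement. It assumes
reg_unit() == 1, so the allocation width is SIMD8; toy_reg, offset_by_width,
and offset_component are hypothetical stand-ins for brw_reg and the offset()
overloads, not the real API:

/* Hypothetical stand-in for brw_reg; only the fields the sketch needs. */
struct toy_reg {
   unsigned byte_offset;   /* byte offset of the current component in the VGRF */
   unsigned type_size;     /* bytes per channel, e.g. 4 for a 32-bit type      */
   bool is_scalar;         /* convergent value, allocated in SIMD8 parcels     */
   bool splat_lane0;       /* models component(reg, 0), i.e. a <0,1,0> region  */
};

/* Step 'delta' whole components, each 'width' channels wide. */
static toy_reg
offset_by_width(toy_reg r, unsigned width, unsigned delta)
{
   r.byte_offset += delta * width * r.type_size;
   return r;
}

/* Mirrors the offset(reg, bld, delta) overload added in brw_fs_builder.h. */
static toy_reg
offset_component(toy_reg r, unsigned dispatch_width, unsigned delta)
{
   const unsigned allocation_width = 8;   /* 8 * reg_unit(), reg_unit() == 1 */

   if (r.is_scalar) {
      /* Convergent vectors are packed as consecutive SIMD8 components. */
      toy_reg o = offset_by_width(r, allocation_width, delta);

      /* A wider dispatch can only read such a register (writing would run
       * past its allocation), so broadcast lane 0 to every channel.
       */
      if (dispatch_width > allocation_width)
         o.splat_lane0 = true;

      return o;
   }

   /* Divergent values are packed in dispatch_width units. */
   return offset_by_width(r, dispatch_width, delta);
}

Under these assumptions, component 1 of a convergent 32-bit vec2 lands at byte
offset 8 * 4 = 32 whether the consumer is SIMD8 or SIMD16; only the region
used to read it (<8,8,1> vs. <0,1,0>) changes.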