diff --git a/src/intel/compiler/brw/brw_lower_simd_width.cpp b/src/intel/compiler/brw/brw_lower_simd_width.cpp
index 2091ecaf4ef..587d52420ca 100644
--- a/src/intel/compiler/brw/brw_lower_simd_width.cpp
+++ b/src/intel/compiler/brw/brw_lower_simd_width.cpp
@@ -207,6 +207,31 @@ is_half_float_src_dst(const brw_inst *inst)
    return false;
 }
 
+/**
+ * Whether \p inst is one of the opcodes below, treated here as SEND messages.
+ * SENDs write physical registers, so lowering must allocate register-aligned
+ * sizes: always the case for types >= 4 bytes, but not for fp16 sampler loads.
+ */
+static bool
+is_send_inst(const brw_inst *inst)
+{
+   switch (inst->opcode) {
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+   case FS_OPCODE_FB_READ_LOGICAL:
+   case SHADER_OPCODE_SAMPLER:
+   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
+   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
+   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_URB_READ_LOGICAL:
+   case SHADER_OPCODE_URB_WRITE_LOGICAL:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
 /**
  * Get the closest native SIMD width supported by the hardware for instruction
  * \p inst.  The instruction will be left untouched by
@@ -493,6 +518,23 @@ needs_dst_copy(const brw_builder &lbld, const brw_inst *inst)
    if (inst->dst.is_null())
       return false;
 
+   /* If we have a SIMD16 SEND message with a destination format like this :
+    *
+    *   g0 : |hf15|hf14|hf13|       ...      |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
+    *
+    * and we have to lower to SIMD8, the lowered format will be this :
+    *
+    *   g0 : |           unused              |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
+    *
+    * Since SEND messages operate on physical registers, we need a copy of the
+    * destination because the second lowered SIMD8 message cannot write to the
+    * upper unused part of the register.
+    */
+   if (is_send_inst(inst) &&
+       (inst->dst.component_size(lbld.dispatch_width()) %
+        (reg_unit(lbld.shader->devinfo) * REG_SIZE)) != 0)
+      return true;
+
    /* If the instruction writes more than one component we'll have to shuffle
     * the results of multiple lowered instructions in order to make sure that
     * they end up arranged correctly in the original destination region.
@@ -556,8 +598,11 @@ emit_zip(const brw_builder &lbld_before, const brw_builder &lbld_after,
    const unsigned dst_size = (inst->size_written - residency_size) /
       inst->dst.component_size(inst->exec_size);
 
-   const brw_reg tmp = lbld_after.vgrf(inst->dst.type,
-                                      dst_size + inst->has_sampler_residency());
+   /* For SEND messages, align the allocation to physical registers */
+   const brw_reg tmp = lbld_after.vgrf(
+      inst->dst.type,
+      (is_send_inst(inst) ? align(dst_size, reg_unit(devinfo)) : dst_size) +
+      inst->has_sampler_residency() * reg_unit(devinfo));
 
    if (inst->predicate) {
       /* Handle predication by copying the original contents of the
@@ -585,7 +630,8 @@ emit_zip(const brw_builder &lbld_before, const brw_builder &lbld_after,
        */
       const brw_builder rbld = lbld_after.uniform();
       brw_reg local_res_reg = component(
-         retype(offset(tmp, lbld_before, dst_size), BRW_TYPE_UW), 0);
+         retype(offset(tmp, lbld_before, dst_size),
+                BRW_TYPE_UW), 0);
       brw_reg final_res_reg =
          retype(byte_offset(inst->dst,
                             inst->size_written - residency_size +
@@ -699,9 +745,13 @@ brw_lower_simd_width(brw_shader &s)
 
          split_inst->dst = emit_zip(lbld.before(inst),
                                    lbld_after, inst);
-         split_inst->size_written =
-            split_inst->dst.component_size(lower_width) * dst_size +
-            residency_size;
+         /* For SEND messages, align the data size to physical registers */
+         unsigned data_size =
+            split_inst->dst.component_size(lower_width) * dst_size;
+         if (is_send_inst(split_inst))
+            data_size = align(data_size, REG_SIZE * reg_unit(s.devinfo));
+
+         split_inst->size_written = data_size + residency_size;
 
          lbld.after(inst).emit(split_inst);
       }