diff --git a/.pick_status.json b/.pick_status.json
index daf89bb3005..1875bb8075b 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -22,7 +22,7 @@
         "description": "intel: Don't cross DWORD boundaries with byte scratch load/store",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "fd04f858b0aa9f688f5dfb041ccb706da96f862a"
     },
diff --git a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
index 2634ef0e47e..d7c11c9df1c 100644
--- a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
+++ b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
@@ -91,6 +91,8 @@ lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
    const unsigned bit_size = intrin->dest.ssa.bit_size;
    const unsigned num_components = intrin->dest.ssa.num_components;
    const unsigned bytes_read = num_components * (bit_size / 8);
+   const unsigned align_mul = nir_intrinsic_align_mul(intrin);
+   const unsigned align_offset = nir_intrinsic_align_offset(intrin);
    const unsigned align = nir_intrinsic_align(intrin);
 
    if (bit_size == 32 && align >= 32 && intrin->num_components <= 4 &&
@@ -153,9 +155,24 @@ lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
       const unsigned bytes_left = bytes_read - load_offset;
       unsigned load_bit_size, load_comps;
       if (align < 4) {
-         load_comps = 1;
          /* Choose a byte, word, or dword */
-         load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
+         unsigned load_bytes = util_next_power_of_two(MIN2(bytes_left, 4));
+
+         if (intrin->intrinsic == nir_intrinsic_load_scratch) {
+            /* The way scratch address swizzling works in the back-end, it
+             * happens at a DWORD granularity so we can't have a single load
+             * or store cross a DWORD boundary.
+             */
+            if ((align_offset % 4) + load_bytes > MIN2(align_mul, 4))
+               load_bytes = MIN2(align_mul, 4) - (align_offset % 4);
+         }
+
+         /* Must be a power of two */
+         if (load_bytes == 3)
+            load_bytes = 2;
+
+         load_bit_size = load_bytes * 8;
+         load_comps = 1;
       } else {
          assert(load_offset % 4 == 0);
          load_bit_size = 32;
@@ -245,11 +262,23 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
          store_bit_size = 32;
          store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
       } else {
+         unsigned store_bytes = MIN2(chunk_bytes, 4);
+
+         if (intrin->intrinsic == nir_intrinsic_store_scratch) {
+            /* The way scratch address swizzling works in the back-end, it
+             * happens at a DWORD granularity so we can't have a single load
+             * or store cross a DWORD boundary.
+             */
+            if ((align_offset % 4) + store_bytes > MIN2(align_mul, 4))
+               store_bytes = MIN2(align_mul, 4) - (align_offset % 4);
+         }
+
+         /* Must be a power of two */
+         if (store_bytes == 3)
+            store_bytes = 2;
+
+         store_bit_size = store_bytes * 8;
          store_comps = 1;
-         store_bit_size = MIN2(chunk_bytes, 4) * 8;
-         /* The bit size must be a power of two */
-         if (store_bit_size == 24)
-            store_bit_size = 16;
       }
 
       const unsigned store_bytes = store_comps * (store_bit_size / 8);
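
The clamp this patch inserts can be exercised outside the driver. Below is a minimal standalone sketch, not Mesa code: MIN2 and next_power_of_two are local stand-ins for Mesa's util macros, and scratch_access_bytes is a hypothetical helper name chosen for illustration. It shows how, for a sub-DWORD-aligned scratch access, the byte count is first shrunk so the access stays inside one DWORD and then rounded down to a power of two.

/* Standalone sketch of the DWORD-boundary clamp added by this patch.
 * Not Mesa code: MIN2 and next_power_of_two reimplement the util macros
 * locally, and scratch_access_bytes is a hypothetical name.
 */
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

static unsigned
next_power_of_two(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Given NIR-style alignment info (align_mul, align_offset) and the number
 * of bytes still to be transferred, return how many bytes the next scratch
 * load or store may cover without crossing a 4-byte (DWORD) boundary.
 */
static unsigned
scratch_access_bytes(unsigned align_mul, unsigned align_offset,
                     unsigned bytes_left)
{
   unsigned load_bytes = next_power_of_two(MIN2(bytes_left, 4));

   /* align_offset % 4 is the access's byte position within a DWORD; if the
    * access would extend past MIN2(align_mul, 4), it would straddle a DWORD
    * boundary, so shrink it to the bytes that fit.
    */
   if ((align_offset % 4) + load_bytes > MIN2(align_mul, 4))
      load_bytes = MIN2(align_mul, 4) - (align_offset % 4);

   /* Access sizes must be a power of two (8, 16, or 32 bits) */
   if (load_bytes == 3)
      load_bytes = 2;

   return load_bytes;
}

int
main(void)
{
   printf("%u\n", scratch_access_bytes(4, 0, 4)); /* 4: whole DWORD fits  */
   printf("%u\n", scratch_access_bytes(4, 1, 4)); /* 2: 3 bytes fit -> 2  */
   printf("%u\n", scratch_access_bytes(4, 3, 4)); /* 1: only 1 byte fits  */
   return 0;
}

In the patch itself the same arithmetic appears twice, once on the load path (lower_mem_load_bit_size, driven by bytes_left) and once on the store path (lower_mem_store_bit_size, driven by chunk_bytes), and the explicit 3 -> 2 fixup replaces the old post-hoc 24-bit -> 16-bit correction of store_bit_size.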