diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c
index 7760b54e1dc..f7a275dc9f7 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir.c
+++ b/src/panfrost/compiler/bifrost/bifrost_nir.c
@@ -656,18 +656,148 @@ bifrost_nir_lower_load_output(nir_shader *nir)
                                        nir_metadata_control_flow, NULL);
 }
 
+static bool
+bytes_can_straddle_boundary(unsigned bytes, unsigned align_mul,
+                            unsigned align_off, unsigned boundary)
+{
+   /* addr = k*align_mul + align_off */
+   assert(IS_POT(align_mul));
+   assert(align_off < align_mul);
+   assert(IS_POT(boundary));
+
+   if (align_mul >= boundary)
+      return (align_off % boundary) + bytes > boundary;
+   else
+      return align_off + bytes > align_mul;
+}
+
+static nir_mem_access_size_align
+size_align_for_bytes(uint8_t bytes, uint8_t bit_size,
+                     uint32_t align_mul, uint32_t align_off)
+{
+   /* Grab the largest power of two which divides bytes */
+   assert(bytes != 0);
+   uint8_t bytes_max_bit_size = 8 << (ffs(bytes) - 1);
+
+   /* Clamp the bit size if needed */
+   bit_size = MIN3(bit_size, bytes_max_bit_size, 64);
+
+   return (nir_mem_access_size_align){
+      .num_components = (bytes * 8) / bit_size,
+      .bit_size = bit_size,
+      .align = nir_combined_align(align_mul, align_off),
+      .shift = nir_mem_access_shift_method_scalar,
+   };
+}
+
+static nir_mem_access_size_align
+scratch_access_size_align_v9(uint8_t bytes, uint8_t bit_size,
+                             uint32_t align_mul, uint32_t align_off)
+{
+   /* On v9-v10, we must never straddle a 16-byte boundary. */
+   while (bytes_can_straddle_boundary(bytes, align_mul, align_off, 16))
+      bytes--;
+
+   return size_align_for_bytes(bytes, bit_size, align_mul, align_off);
+}
+
+static nir_mem_access_size_align
+scratch_access_size_align_v11(uint8_t bytes, uint8_t bit_size,
+                              uint32_t align_mul, uint32_t align_off,
+                              bool is_store)
+{
+   assert(align_off < align_mul);
+
+   /* v11+ has complex rules based on how many bytes are accessed. Check if
+    * the access is legal; otherwise, reduce the number of bytes accessed.
+    */
+   for (; bytes > 1; bytes--) {
+      switch (bytes) {
+      case 2:
+         /* Must not straddle a 4-byte boundary */
+         if (bytes_can_straddle_boundary(bytes, align_mul, align_off, 4))
+            continue;
+
+         return size_align_for_bytes(2, bit_size, align_mul, align_off);
+
+      case 3:
+         /* No restrictions for stores */
+         if (is_store)
+            return size_align_for_bytes(3, 8, align_mul, align_off);
+
+         /* We can do 3-byte loads as long as they're aligned to 4 bytes.
+          * If align_off == 1, nir_lower_mem_access_bit_sizes() will upgrade
+          * to an aligned 4-byte load and shift the result.
+          */
+         if (align_mul >= 4 && align_off <= 1)
+            return size_align_for_bytes(3, 8, 4, 0);
+
+         /* Otherwise, we have to split the load */
+         continue;
+
+      case 4:
+      case 8:
+      case 16:
+         /* Must be 4-byte aligned */
+         if (nir_combined_align(align_mul, align_off) < 4)
+            continue;
+
+         return size_align_for_bytes(bytes, bit_size, align_mul, align_off);
+
+      case 6:
+         if (is_store) {
+            /* Stores must not straddle a 4-byte boundary. */
+            if (bytes_can_straddle_boundary(bytes, align_mul, align_off, 4))
+               continue;
+
+            return size_align_for_bytes(6, bit_size, align_mul, align_off);
+         } else {
+            /* Loads must be aligned to 4 bytes. */
+            if (nir_combined_align(align_mul, align_off) < 4)
+               continue;
+
+            return size_align_for_bytes(6, bit_size, 4, 0);
+         }
+
+      default: /* 5, 7, 9-15 */
+         /* All other sizes have to fall back to a smaller load */
+         continue;
+      }
+   }
+
+   /* 1-byte accesses are always unrestricted */
+   assert(bytes == 1);
+   return size_align_for_bytes(1, 8, align_mul, align_off);
+}
+
 static nir_mem_access_size_align
 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
                          uint8_t bit_size, uint32_t align_mul,
                          uint32_t align_offset, bool offset_is_const,
                          enum gl_access_qualifier access, const void *cb_data)
 {
+   uint64_t gpu_id = *(uint64_t *)cb_data;
    uint32_t align = nir_combined_align(align_mul, align_offset);
    assert(util_is_power_of_two_nonzero(align));
 
+   bool is_scratch = intrin == nir_intrinsic_load_scratch ||
+                     intrin == nir_intrinsic_store_scratch;
+
    /* No more than 16 bytes at a time. */
    bytes = MIN2(bytes, 16);
 
+   /* TLS memory requires special alignment; handle it separately. */
+   if (pan_arch(gpu_id) >= 9 && is_scratch) {
+      bool is_store = intrin == nir_intrinsic_store_scratch;
+
+      if (pan_arch(gpu_id) >= 11)
+         return scratch_access_size_align_v11(bytes, bit_size, align_mul,
+                                              align_offset, is_store);
+      else
+         return scratch_access_size_align_v9(bytes, bit_size, align_mul,
+                                             align_offset);
+   }
+
    /* All loads must be aligned up to the next power of two of their byte
     * size. If we have insufficient alignment, split into smaller loads. */
    unsigned required_align = util_next_power_of_two(bytes);
@@ -758,6 +888,7 @@ bifrost_postprocess_nir(nir_shader *nir, uint64_t gpu_id)
       nir_var_shader_temp | nir_var_function_temp | nir_var_mem_global |
          nir_var_mem_shared,
       .callback = mem_access_size_align_cb,
+      .cb_data = (void *)&gpu_id,
    };
 
    NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
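
Note (illustration only, not part of the patch): the straddle check is the core of
both lowering paths and is easy to sanity-check in isolation. The standalone
program below replicates bytes_can_straddle_boundary() and the v9 shrinking loop;
IS_POT is redefined locally to match the behavior of Mesa's util/macros.h macro,
and the expected output is given in the comments.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Local stand-in for Mesa's util/macros.h IS_POT(). */
#define IS_POT(v) ((v) != 0 && ((v) & ((v) - 1)) == 0)

static bool
bytes_can_straddle_boundary(unsigned bytes, unsigned align_mul,
                            unsigned align_off, unsigned boundary)
{
   /* addr = k*align_mul + align_off */
   assert(IS_POT(align_mul));
   assert(align_off < align_mul);
   assert(IS_POT(boundary));

   if (align_mul >= boundary)
      return (align_off % boundary) + bytes > boundary;
   else
      return align_off + bytes > align_mul;
}

int
main(void)
{
   /* v9-v10: shrink until the access cannot cross a 16-byte line. A 16-byte
    * access only known to be 4-byte aligned may cross, so it shrinks to 4. */
   unsigned bytes = 16;
   while (bytes_can_straddle_boundary(bytes, 4, 0, 16))
      bytes--;
   printf("16B at align 4+0  -> %u bytes\n", bytes);   /* 4 */

   /* With 16-byte alignment the full access is legal. */
   bytes = 16;
   while (bytes_can_straddle_boundary(bytes, 16, 0, 16))
      bytes--;
   printf("16B at align 16+0 -> %u bytes\n", bytes);   /* 16 */

   /* v11 case 2: a 2-byte access at offset 3 mod 4 crosses a word, so the
    * lowering falls through to a 1-byte access. */
   printf("2B at align 4+3 straddles 4B: %d\n",
          bytes_can_straddle_boundary(2, 4, 3, 4));    /* 1 */
   return 0;
}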
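
Similarly, the bytes -> (num_components, bit_size) decomposition performed by
size_align_for_bytes() can be checked with plain integer math. This sketch is
again not part of the patch: MIN3 is redefined locally and __builtin_ffs stands
in for the ffs() used in the patch. It shows how a 6-byte access becomes
3 x 16-bit components, because 2 is the largest power of two dividing 6.

#include <stdio.h>

/* Local stand-in for Mesa's MIN3(). */
#define MIN3(a, b, c) \
   ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

/* Mirrors the bit-size selection in size_align_for_bytes(): the bit size is
 * clamped to the largest power of two dividing `bytes` (and to 64). */
static unsigned
pick_bit_size(unsigned bytes, unsigned bit_size)
{
   unsigned bytes_max_bit_size = 8u << (__builtin_ffs((int)bytes) - 1);
   return MIN3(bit_size, bytes_max_bit_size, 64u);
}

int
main(void)
{
   /* 6 bytes requested at 32-bit: the access becomes 3 x 16-bit. */
   unsigned bs = pick_bit_size(6, 32);
   printf("6 bytes  @32 -> %u x %u-bit\n", (6 * 8) / bs, bs);   /* 3 x 16 */

   /* 16 bytes requested at 32-bit stays 4 x 32-bit. */
   bs = pick_bit_size(16, 32);
   printf("16 bytes @32 -> %u x %u-bit\n", (16 * 8) / bs, bs);  /* 4 x 32 */
   return 0;
}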