diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 40f18a0517b..f1aa0d5ac1a 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -4573,10 +4573,10 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld, static unsigned choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords) { - const unsigned min_block = 8; + const unsigned min_block = devinfo->has_lsc ? 1 : 4; const unsigned max_block = devinfo->has_lsc ? 64 : 32; - const unsigned block = 1 << util_logbase2(dwords); + const unsigned block = dwords > 4 ? 1 << util_logbase2(dwords) : dwords; return CLAMP(block, min_block, max_block); } @@ -6188,23 +6188,32 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, unsigned first_read_component = 0; if (convergent_block_load) { - /* If the address is a constant and alignment permits, skip unread - * leading and trailing components. (It's probably not worth the + /* If the address is a constant and alignment permits, skip as many + * unread leading and trailing components as we can without splitting + * the load into more smaller blocks. (It's probably not worth the * extra address math for non-constant addresses.) * * Note that SLM block loads on HDC platforms need to be 16B aligned. 
*/ if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM && - alignment >= data_bit_size / 8 && - (devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) { + alignment >= nir_bit_size / 8) { first_read_component = nir_def_first_component_read(&instr->def); - unsigned last_component = nir_def_last_component_read(&instr->def); + unsigned last_component = nir_def_last_component_read(&instr->def) + 1; + if (!devinfo->has_lsc && mode == MEMORY_MODE_SHARED_LOCAL) { + first_read_component = ROUND_DOWN_TO(first_read_component, 4); + last_component = align(last_component, 4); + } + total = last_component - first_read_component; + total = brw_uniform_block_size(devinfo, total); + first_read_component = + total >= last_component ? 0 : last_component - total; + components = MIN2(components, last_component) - first_read_component; srcs[MEMORY_LOGICAL_ADDRESS].u64 += - first_read_component * (data_bit_size / 8); - components = last_component - first_read_component + 1; + first_read_component * (nir_bit_size / 8); + } else { + total = brw_uniform_block_size(devinfo, components); } - total = align(components, REG_SIZE * reg_unit(devinfo) / 4); dest = ubld.vgrf(BRW_TYPE_UD, total); } else { total = components * bld.dispatch_width(); @@ -6218,6 +6227,11 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, unsigned block_comps = choose_block_size_dwords(devinfo, total - done); const unsigned block_bytes = block_comps * (nir_bit_size / 8); + /* Our current choice of block sizes and 32-bit data type will + * always give us a GRF-aligned offset into dest + */ + assert(done % (REG_SIZE / 4 * reg_unit(devinfo)) == 0); + brw_reg dst_offset = is_store ? 
brw_reg() : retype(byte_offset(dest, done * 4), BRW_TYPE_UD); if (is_store) { @@ -6228,7 +6242,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem(); mem->has_no_mask_send_params = no_mask_handle; if (is_load) - mem->size_written = block_bytes; + mem->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo)); mem->lsc_op = op; mem->mode = *mode; mem->binding_type = *binding_type; diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h index 6cefd4be113..62eb3a5db9f 100644 --- a/src/intel/compiler/brw/brw_nir.h +++ b/src/intel/compiler/brw/brw_nir.h @@ -326,6 +326,24 @@ bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data); +/** + * Gets the size of a nir_load_*_uniform_block_intel after it's lowered + * by the backend to a block load message; note that page faults can + * happen if this is not accounted for when using these intrinsics. + */ +static inline unsigned +brw_uniform_block_size(const struct intel_device_info *devinfo, + unsigned num_components) +{ + /* Round up to a supported block size, or to the nearest multiple of + * 16 components if it's any larger. + */ + return num_components > 8 ? align(num_components, 16) + : num_components > 4 ? 8 + : !devinfo->has_lsc ? 4 + : num_components; +} + void brw_nir_optimize(struct brw_pass_tracker *pt); #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0