brw: Avoid rounding every convergent block load up to a full register

To simplify things, our backend rounds convergent block loads up to a full
register. This causes page faults with the scratch page disabled since the
address is not always aligned to a register size. Loading smaller blocks is
slightly more difficult because the SEND instruction can only write back a
multiple of full registers, even if the actual data is smaller.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
Author:    Calder Young
Date:      2026-03-29 22:49:06 -07:00
Committer: Marge Bot
Parent:    8ce98fedc4
Commit:    3ac6233655

2 changed files with 43 additions and 11 deletions
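
Editor's note: the following is a minimal standalone sketch (not Mesa code)
contrasting the old and new sizing, assuming LSC hardware, 32-bit components,
and 32-byte GRFs (REG_SIZE * reg_unit(devinfo) == 32); old_total() and
new_total() are hypothetical names used for illustration only.

   #include <stdio.h>

   #define REG_DWORDS 8 /* one 32-byte GRF holds 8 dwords */

   /* Old scheme: round the read itself up to whole registers. */
   static unsigned old_total(unsigned components)
   {
      return (components + REG_DWORDS - 1) / REG_DWORDS * REG_DWORDS;
   }

   /* New scheme on LSC: read only a supported block size (mirroring
    * brw_uniform_block_size() below); only the destination register
    * allocation stays padded. */
   static unsigned new_total(unsigned components)
   {
      if (components > 8)
         return (components + 15) / 16 * 16;
      if (components > 4)
         return 8;
      return components;
   }

   int main(void)
   {
      for (unsigned c = 1; c <= 12; c++)
         printf("components=%2u old=%2u new=%2u dwords read\n",
                c, old_total(c), new_total(c));
      return 0;
   }

For a 3-component load at the end of a buffer, the old path rounded the read
up to 8 dwords (32 bytes) and could fault past the allocation with the
scratch page disabled; the new path reads just the 3 dwords it needs.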

@@ -4573,10 +4573,10 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
 static unsigned
 choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
 {
-   const unsigned min_block = 8;
+   const unsigned min_block = devinfo->has_lsc ? 1 : 4;
    const unsigned max_block = devinfo->has_lsc ? 64 : 32;
-   const unsigned block = 1 << util_logbase2(dwords);
+   const unsigned block = dwords > 4 ? 1 << util_logbase2(dwords) : dwords;
    return CLAMP(block, min_block, max_block);
 }
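
Editor's note, working the new choose_block_size_dwords() through a few
values: the old code turned dwords = 3 into block = 1 << util_logbase2(3) = 2,
which CLAMP() then raised to min_block = 8, an eight-dword read. The new code
takes the dwords <= 4 arm, so 3 stays 3 on LSC hardware (min_block = 1) and
clamps to 4 on pre-LSC HDC hardware (min_block = 4). Larger sizes behave as
before, e.g. dwords = 13 still gives 1 << util_logbase2(13) = 8.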
@@ -6188,23 +6188,32 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
    unsigned first_read_component = 0;
    if (convergent_block_load) {
-      /* If the address is a constant and alignment permits, skip unread
-       * leading and trailing components. (It's probably not worth the
+      /* If the address is a constant and alignment permits, skip as many
+       * unread leading and trailing components as we can without splitting
+       * the load into more smaller blocks. (It's probably not worth the
        * extra address math for non-constant addresses.)
+       *
+       * Note that SLM block loads on HDC platforms need to be 16B aligned.
        */
       if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
-          alignment >= data_bit_size / 8 &&
-          (devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) {
+          alignment >= nir_bit_size / 8) {
          first_read_component = nir_def_first_component_read(&instr->def);
-         unsigned last_component = nir_def_last_component_read(&instr->def);
+         unsigned last_component = nir_def_last_component_read(&instr->def) + 1;
+         if (!devinfo->has_lsc && mode == MEMORY_MODE_SHARED_LOCAL) {
+            first_read_component = ROUND_DOWN_TO(first_read_component, 4);
+            last_component = align(last_component, 4);
+         }
+         total = last_component - first_read_component;
+         total = brw_uniform_block_size(devinfo, total);
+         first_read_component =
+            total >= last_component ? 0 : last_component - total;
+         components = MIN2(components, last_component) - first_read_component;
          srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
-            first_read_component * (data_bit_size / 8);
-         components = last_component - first_read_component + 1;
+            first_read_component * (nir_bit_size / 8);
+      } else {
+         total = brw_uniform_block_size(devinfo, components);
       }
-      total = align(components, REG_SIZE * reg_unit(devinfo) / 4);
       dest = ubld.vgrf(BRW_TYPE_UD, total);
    } else {
       total = components * bld.dispatch_width();
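
Editor's note, a worked example of the trimming above (assuming LSC, 32-bit
components, and a suitably aligned constant address): for a vec8 block load
where only components 2..5 are read, nir_def_first_component_read() returns 2
and nir_def_last_component_read() returns 5, so last_component = 6 and
total = 6 - 2 = 4, which brw_uniform_block_size() leaves at 4. Since
4 < last_component, first_read_component stays 6 - 4 = 2: the address advances
by 2 * 4 = 8 bytes and only four dwords are loaded. If components 2..6 were
read instead, total = 5 would round up to a block of 8; since 8 >=
last_component = 7, first_read_component drops back to 0 and the block starts
at component 0 again rather than splitting into smaller loads.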
@@ -6218,6 +6227,11 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       unsigned block_comps = choose_block_size_dwords(devinfo, total - done);
       const unsigned block_bytes = block_comps * (nir_bit_size / 8);
 
+      /* Our current choice of block sizes and 32-bit data type will
+       * always give us a GRF-aligned offset into dest
+       */
+      assert(done % (REG_SIZE / 4 * reg_unit(devinfo)) == 0);
+
       brw_reg dst_offset = is_store ? brw_reg() :
          retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
       if (is_store) {
@@ -6228,7 +6242,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
       mem->has_no_mask_send_params = no_mask_handle;
       if (is_load)
-         mem->size_written = block_bytes;
+         mem->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo));
       mem->lsc_op = op;
       mem->mode = *mode;
       mem->binding_type = *binding_type;
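
Editor's note: size_written still has to be rounded up to whole GRFs here
because, as the commit message says, the SEND can only write back a multiple
of full registers even when the block is smaller. With 32-byte GRFs, a
3-dword (12-byte) block load therefore still has size_written = 32; it is
only the memory read itself that shrinks, which is what avoids the fault.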

@@ -326,6 +326,24 @@ bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   nir_intrinsic_instr *high,
                                   void *data);
 
+/**
+ * Gets the size of a nir_load_*_uniform_block_intel after it's lowered
+ * by the backend to a block load message. Note that page faults can
+ * happen if this is not accounted for when using these intrinsics.
+ */
+static inline unsigned
+brw_uniform_block_size(const struct intel_device_info *devinfo,
+                       unsigned num_components)
+{
+   /* Round up to a supported block size, or to the nearest multiple of
+    * 16 components if it's any larger.
+    */
+   return num_components > 8 ? align(num_components, 16)
+                             : num_components > 4 ? 8
+                             : !devinfo->has_lsc ? 4
+                             : num_components;
+}
+
 void brw_nir_optimize(struct brw_pass_tracker *pt);
 
 #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
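
Editor's note, tabulating brw_uniform_block_size() for a few sizes: on LSC
hardware num_components = 3 -> 3, 5 -> 8, 9 -> 16, and 20 -> 32; without LSC,
3 rounds up to the HDC minimum of 4. Per the comment above, code sizing
buffers used with nir_load_*_uniform_block_intel must account for these
rounded sizes to avoid page faults.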