From c7e48f79b7be5c0ec34d9267a32cf8cba08e2248 Mon Sep 17 00:00:00 2001 From: Calder Young Date: Tue, 29 Jul 2025 14:44:33 -0700 Subject: [PATCH] brw,anv: Reduce UBO robustness size alignment to 16 bytes Instead of being encoded as a contiguous 64-bit mask of individual registers, the robustness information is now encoded as a vector of up to 4 bytes that represent the limits of each of the pushed UBO ranges in 16 byte units. Some buggy Direct3D workloads are known to depend on a robustness alignment as low as 16 bytes to work properly. Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_compiler.h | 15 ++- src/intel/compiler/brw_shader.cpp | 106 ++++++++++++++---- src/intel/vulkan/anv_descriptor_set.c | 6 +- .../vulkan/anv_nir_compute_push_layout.c | 8 +- src/intel/vulkan/anv_nir_lower_ubo_loads.c | 31 ++++- src/intel/vulkan/anv_physical_device.c | 2 +- src/intel/vulkan/anv_private.h | 3 +- src/intel/vulkan/genX_cmd_buffer.c | 2 +- src/intel/vulkan/genX_cmd_draw.c | 47 ++++---- 9 files changed, 149 insertions(+), 71 deletions(-) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 50577d51d5d..1e26e9f4bde 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -599,17 +599,16 @@ struct brw_stage_prog_data { mesa_shader_stage stage; - /* zero_push_reg is a bitfield which indicates what push registers (if any) - * should be zeroed by SW at the start of the shader. The corresponding - * push_reg_mask_param specifies the param index (in 32-bit units) where - * the actual runtime 64-bit mask will be pushed. The shader will zero - * push reg i if + /* If robust_ubo_ranges not 0, push_reg_mask_param specifies the param + * index (in 32-bit units) where the 4 UBO range limits will be pushed + * as 8-bit integers. 
The shader will zero byte i of UBO range j if: * - * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i) + * (robust_ubo_ranges & (1 << j)) && + * (i >= push_reg_mask_param[j] * 16) - * If this field is set, brw_compiler::compact_params must be false. + * brw_compiler::compact_params must be false if robust_ubo_ranges is used */ - uint64_t zero_push_reg; + uint8_t robust_ubo_ranges; unsigned push_reg_mask_param; unsigned curb_read_length; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 597f90f811b..b33fa4a493a 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -813,39 +813,99 @@ brw_shader::assign_curb_setup() } } - uint64_t want_zero = used & prog_data->zero_push_reg; - if (want_zero) { + if (prog_data->robust_ubo_ranges) { brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block()); + /* At most we can write 2 GRFs (HW limit), the SIMD width matching the + * HW generation depends on the size of the physical register. 
+ */ + const unsigned max_grf_writes = 2 * reg_unit(devinfo); + assert(max_grf_writes <= 4); /* push_reg_mask_param is in 32-bit units */ unsigned mask_param = prog_data->push_reg_mask_param; - struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8, - mask_param % 8); + brw_reg mask = retype(brw_vec1_grf(payload().num_regs + mask_param / 8, + mask_param % 8), BRW_TYPE_UB); - brw_reg b32; - for (unsigned i = 0; i < 64; i++) { - if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { - brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2); - ubld.SHL(horiz_offset(shifted, 8), - byte_offset(retype(mask, BRW_TYPE_W), i / 8), - brw_imm_v(0x01234567)); - ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); + /* For each 16bit lane, generate an offset in unit of 16B */ + brw_reg offset_base = ubld.vgrf(BRW_TYPE_UW, max_grf_writes); + ubld.MOV(offset_base, brw_imm_uv(0x76543210)); + ubld.MOV(horiz_offset(offset_base, 8), brw_imm_uv(0xFEDCBA98)); + if (max_grf_writes > 2) + ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16)); - brw_builder ubld16 = ubld.group(16, 0); - b32 = ubld16.vgrf(BRW_TYPE_D); - ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); - } + u_foreach_bit(i, prog_data->robust_ubo_ranges) { + struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; - if (want_zero & BITFIELD64_BIT(i)) { - assert(i < prog_data->curb_read_length); - struct brw_reg push_reg = - retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D); + unsigned range_start = ubo_push_start[i] / 8; + uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length); + if (!want_zero) + continue; - ubld.AND(push_reg, push_reg, component(b32, i % 16)); - } + const unsigned grf_start = payload().num_regs + range_start; + const unsigned grf_end = grf_start + ubo_range->length; + const unsigned max_grf_mask = max_grf_writes * 4; + unsigned grf = grf_start; + + do { + unsigned mask_length = MIN2(grf_end - grf, max_grf_mask); + 
unsigned simd_width_mask = 1 << util_last_bit(mask_length * 2 - 1); + + if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, mask_length))) { + grf += max_grf_mask; + continue; + } + + /* Prepare section of mask, at 1/4 size */ + brw_builder ubld_mask = ubld.group(simd_width_mask, 0); + brw_reg offset_reg = ubld_mask.vgrf(BRW_TYPE_UW); + unsigned mask_start = grf, mask_end = grf + mask_length; + ubld_mask.ADD(offset_reg, offset_base, brw_imm_uw((mask_start - grf_start) * 2)); + /* Compare the 16B increments with the value coming from push + * constants and store the result into a dword. This expands a + * comparison between 2 values in 16B increments into a 32bit mask + * where each bit covers 4bits of data in the payload. + * + * This expansion works because of the sign extension guaranteed + * by the HW. + * + * SKL PRMs, Volume 7: 3D-Media-GPGPU, Execution Data Type: + * + * "The following rules explain the conversion of multiple + * source operand types, possibly a mix of different types, to + * one common execution type: + * - ... + * - Unsigned integers are converted to signed integers. + * - Byte (B) or Unsigned Byte (UB) values are converted to a Word + * or wider integer execution type. + * - If source operands have different integer widths, use + * the widest width specified to choose the signed integer + * execution type." + */ + brw_reg mask_reg = ubld_mask.vgrf(BRW_TYPE_UD); + ubld_mask.CMP(mask_reg, byte_offset(mask, i), offset_reg, BRW_CONDITIONAL_G); + + for (unsigned and_length; grf < mask_end; grf += and_length) { + and_length = 1u << (util_last_bit(MIN2(grf_end - grf, max_grf_writes)) - 1); + + if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, and_length))) + continue; + + brw_reg push_reg = retype(brw_vec8_grf(grf, 0), BRW_TYPE_D); + + /* Expand the masking bits one more time (1bit -> 4bit because + * UB -> UD) so that now each 8bits of mask cover 32bits of + * data to mask, while doing the masking in the payload data. 
+ */ + ubld.group(and_length * 8, 0).AND( + push_reg, + byte_offset(retype(mask_reg, BRW_TYPE_B), + (grf - mask_start) * 8), + push_reg); + } + } while (grf < grf_end); } - invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); + invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | BRW_DEPENDENCY_VARIABLES); } /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index 5b8e5df711f..4f456422fd2 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -2429,13 +2429,13 @@ anv_descriptor_set_write_buffer(struct anv_device *device, struct anv_address bind_addr = anv_address_add(buffer->address, offset); desc->bind_range = vk_buffer_range(&buffer->vk, offset, range); - /* We report a bounds checking alignment of ANV_UBO_ALIGNMENT in + /* We report a bounds checking alignment of ANV_UBO_BOUNDS_CHECK_ALIGNMENT in * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment * so align the range to that. */ if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) - desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT); + desc->bind_range = align64(desc->bind_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) { struct anv_address_range_descriptor desc_data = { @@ -3014,7 +3014,7 @@ void anv_GetDescriptorEXT( * messages which read an entire register worth at a time. */ if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) - range = align64(range, ANV_UBO_ALIGNMENT); + range = align64(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); isl_surf_usage_flags_t usage = pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ? 
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 3e246dba13a..773a8a06fc0 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -273,9 +273,8 @@ anv_nir_compute_push_layout(nir_shader *nir, } const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4; - unsigned range_start_reg = push_constant_range.length; - for (int i = 0; i < 4; i++) { + for (unsigned i = 0; i < 4; i++) { struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; if (ubo_range->length == 0) continue; @@ -300,11 +299,8 @@ anv_nir_compute_push_layout(nir_shader *nir, /* We only bother to shader-zero pushed client UBOs */ if (binding->set < MAX_SETS && (robust_flags & BRW_ROBUSTNESS_UBO)) { - prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg, - ubo_range->length); + prog_data->robust_ubo_ranges |= (uint8_t) (1 << i); } - - range_start_reg += ubo_range->length; } } else if (push_constant_range.length > 0) { /* For Ivy Bridge, the push constants packets have a different diff --git a/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/src/intel/vulkan/anv_nir_lower_ubo_loads.c index 49e6f15a719..6406701c924 100644 --- a/src/intel/vulkan/anv_nir_lower_ubo_loads.c +++ b/src/intel/vulkan/anv_nir_lower_ubo_loads.c @@ -53,7 +53,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load, assert(ANV_UBO_ALIGNMENT == 64); unsigned suboffset = offset % 64; - uint64_t aligned_offset = offset - suboffset; + unsigned aligned_offset = offset - suboffset; /* Load two just in case we go over a 64B boundary */ nir_def *data[2]; @@ -64,11 +64,30 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load, b, 16, 32, addr, .access = nir_intrinsic_access(load), .align_mul = 64); - if (bound) { - data[i] = nir_bcsel(b, - nir_igt_imm(b, bound, aligned_offset + i * 64 + 63), - data[i], - nir_imm_int(b, 0)); + } + + if (bound) { + nir_def* offsets = + 
nir_imm_uvec8(b, aligned_offset, aligned_offset + 16, + aligned_offset + 32, aligned_offset + 48, + aligned_offset + 64, aligned_offset + 80, + aligned_offset + 96, aligned_offset + 112); + nir_def* mask = + nir_bcsel(b, nir_ilt(b, offsets, bound), + nir_imm_int(b, 0xFFFFFFFF), + nir_imm_int(b, 0x00000000)); + + for (unsigned i = 0; i < 2; i++) { + /* We prepared a mask where every 1 bit of mask covers 4 bits of the + * UBO block we've loaded, when we apply it we'll sign extend each + * byte of the mask to a dword to get the final bitfield, this can + * be optimized because Intel HW allows instructions to mix several + * types and perform the sign extensions implicitly. + */ + data[i] = + nir_iand(b, + nir_i2iN(b, nir_extract_bits(b, &mask, 1, i * 128, 16, 8), 32), + data[i]); } } diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 310c48672a4..98a67180890 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -1568,7 +1568,7 @@ get_properties(const struct anv_physical_device *pdevice, props->robustStorageBufferAccessSizeAlignment = ANV_SSBO_BOUNDS_CHECK_ALIGNMENT; props->robustUniformBufferAccessSizeAlignment = - ANV_UBO_ALIGNMENT; + ANV_UBO_BOUNDS_CHECK_ALIGNMENT; } /* VK_KHR_vertex_attribute_divisor */ diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index f0322888588..05d2dc6e0cb 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -198,6 +198,7 @@ get_max_vbs(const struct intel_device_info *devinfo) { * GEM object. */ #define ANV_UBO_ALIGNMENT 64 +#define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 16 #define ANV_SSBO_ALIGNMENT 4 #define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4 #define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16 @@ -3973,7 +3974,7 @@ struct anv_push_constants { uint32_t tcs_input_vertices; /** Robust access pushed registers. 
*/ - uint64_t push_reg_mask[MESA_SHADER_STAGES]; + uint8_t push_reg_mask[MESA_SHADER_STAGES][4]; uint32_t fs_per_prim_remap_offset; } gfx; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 83331827fd7..2ef239d7209 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1957,7 +1957,7 @@ emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer, * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment */ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) - range = align(range, ANV_UBO_ALIGNMENT); + range = align(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); struct anv_address address = anv_address_add(desc->buffer->address, offset); diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index 7da50785a29..f7f61b38471 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -305,7 +305,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer, uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset); /* Align the range for consistency */ - bound_range = align(bound_range, ANV_UBO_ALIGNMENT); + bound_range = align(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); return bound_range; } @@ -444,42 +444,45 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer, continue; const struct anv_shader_bin *shader = gfx->shaders[stage]; - if (shader->prog_data->zero_push_reg) { + if (shader->prog_data->robust_ubo_ranges) { const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; struct anv_push_constants *push = &gfx->base.push_constants; - push->gfx.push_reg_mask[stage] = 0; - /* Start of the current range in the shader, relative to the start of - * push constants in the shader. 
- */ - unsigned range_start_reg = 0; + unsigned ubo_range_index = 0; for (unsigned i = 0; i < 4; i++) { const struct anv_push_range *range = &bind_map->push_ranges[i]; if (range->length == 0) continue; - /* Never clear this padding register as it might contain payload - * data. - */ - if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING) + /* Skip any push ranges that were not promoted from UBOs */ + if (range->set >= MAX_SETS) continue; + assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index)); + unsigned bound_size = get_push_range_bound_size(cmd_buffer, shader, range); - if (bound_size >= range->start * 32) { - unsigned bound_regs = - MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, - range->length); - assert(range_start_reg + bound_regs <= 64); - push->gfx.push_reg_mask[stage] |= - BITFIELD64_RANGE(range_start_reg, bound_regs); + + uint8_t range_mask = 0; + + /* Determine the bound length of the range in 16-byte units */ + if (bound_size > range->start * 32) { + bound_size = MIN2( + DIV_ROUND_UP(bound_size - range->start * 32, 16), + 2 * range->length); + range_mask = (uint8_t) bound_size; + assert(bound_size < 256); } - cmd_buffer->state.push_constants_dirty |= - mesa_to_vk_shader_stage(stage); - gfx->base.push_constants_data_dirty = true; + /* Update the pushed bound length constant if it changed */ + if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) { + push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask; + cmd_buffer->state.push_constants_dirty |= + mesa_to_vk_shader_stage(stage); + gfx->base.push_constants_data_dirty = true; + } - range_start_reg += range->length; + ++ubo_range_index; } } }