brw,anv: Reduce UBO robustness size alignment to 16 bytes

Instead of being encoded as a contiguous 64-bit mask of individual registers,
the robustness information is now encoded as a vector of up to 4 bytes, each
representing the limit of one of the pushed UBO ranges in 16-byte units.
Some buggy Direct3D workloads are known to depend on a robustness size
alignment as low as 16 bytes to work properly.
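
For reference, a minimal sketch of what the change means for the pushed
robustness data per shader stage (field types follow the anv_push_constants
hunk below; the struct and macro names here are made up for illustration):

#include <stdint.h>

#define STAGES 6 /* placeholder stage count, just for this sketch */

struct robustness_push_before {
   /* One bit per pushed 32B register that may need zeroing at runtime. */
   uint64_t push_reg_mask[STAGES];
};

struct robustness_push_after {
   /* One byte per pushed UBO range (up to 4), holding the bound size of
    * that range in 16-byte units.
    */
   uint8_t push_reg_mask[STAGES][4];
};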

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36455>
Calder Young 2025-07-29 14:44:33 -07:00 committed by Marge Bot
parent a3ecdf33a3
commit c7e48f79b7
9 changed files with 149 additions and 71 deletions


@@ -599,17 +599,16 @@ struct brw_stage_prog_data {
mesa_shader_stage stage;
/* zero_push_reg is a bitfield which indicates what push registers (if any)
* should be zeroed by SW at the start of the shader. The corresponding
* push_reg_mask_param specifies the param index (in 32-bit units) where
* the actual runtime 64-bit mask will be pushed. The shader will zero
* push reg i if
/* If robust_ubo_ranges is not 0, push_reg_mask_param specifies the param
* index (in 32-bit units) where the 4 UBO range limits will be pushed
* as 8-bit integers. The shader will zero byte i of UBO range j if:
*
* reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
* (robust_ubo_ranges & (1 << j)) &&
(i >= push_reg_mask_param[j] * 16)
*
* If this field is set, brw_compiler::compact_params must be false.
* brw_compiler::compact_params must be false if robust_ubo_ranges is used.
*/
uint64_t zero_push_reg;
uint8_t robust_ubo_ranges;
unsigned push_reg_mask_param;
unsigned curb_read_length;
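
A stand-alone sketch of the zeroing rule stated in the comment above (the
helper and its parameters are illustrative, not part of the compiler):

#include <stdbool.h>
#include <stdint.h>

/* Sketch: would byte i of pushed UBO range j be zeroed by the shader,
 * given the four 8-bit limits (in 16-byte units) pushed at
 * push_reg_mask_param?
 */
static bool
ubo_byte_is_zeroed(uint8_t robust_ubo_ranges, const uint8_t limits[4],
                   unsigned j, unsigned i)
{
   if (!(robust_ubo_ranges & (1u << j)))
      return false;                      /* range j is not bounds-checked */
   return i >= (unsigned)limits[j] * 16; /* byte lies past the bound size */
}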


@@ -813,39 +813,99 @@ brw_shader::assign_curb_setup()
}
}
uint64_t want_zero = used & prog_data->zero_push_reg;
if (want_zero) {
if (prog_data->robust_ubo_ranges) {
brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block());
/* At most we can write 2 GRFs (HW limit); the matching SIMD width
* depends on the size of the physical register for the HW generation.
*/
const unsigned max_grf_writes = 2 * reg_unit(devinfo);
assert(max_grf_writes <= 4);
/* push_reg_mask_param is in 32-bit units */
unsigned mask_param = prog_data->push_reg_mask_param;
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
mask_param % 8);
brw_reg mask = retype(brw_vec1_grf(payload().num_regs + mask_param / 8,
mask_param % 8), BRW_TYPE_UB);
brw_reg b32;
for (unsigned i = 0; i < 64; i++) {
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
ubld.SHL(horiz_offset(shifted, 8),
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
brw_imm_v(0x01234567));
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
/* For each 16-bit lane, generate an offset in units of 16B */
brw_reg offset_base = ubld.vgrf(BRW_TYPE_UW, max_grf_writes);
ubld.MOV(offset_base, brw_imm_uv(0x76543210));
ubld.MOV(horiz_offset(offset_base, 8), brw_imm_uv(0xFEDCBA98));
if (max_grf_writes > 2)
ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16));
brw_builder ubld16 = ubld.group(16, 0);
b32 = ubld16.vgrf(BRW_TYPE_D);
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
}
u_foreach_bit(i, prog_data->robust_ubo_ranges) {
struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
if (want_zero & BITFIELD64_BIT(i)) {
assert(i < prog_data->curb_read_length);
struct brw_reg push_reg =
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
unsigned range_start = ubo_push_start[i] / 8;
uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length);
if (!want_zero)
continue;
ubld.AND(push_reg, push_reg, component(b32, i % 16));
}
const unsigned grf_start = payload().num_regs + range_start;
const unsigned grf_end = grf_start + ubo_range->length;
const unsigned max_grf_mask = max_grf_writes * 4;
unsigned grf = grf_start;
do {
unsigned mask_length = MIN2(grf_end - grf, max_grf_mask);
unsigned simd_width_mask = 1 << util_last_bit(mask_length * 2 - 1);
if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, mask_length))) {
grf += max_grf_mask;
continue;
}
/* Prepare section of mask, at 1/4 size */
brw_builder ubld_mask = ubld.group(simd_width_mask, 0);
brw_reg offset_reg = ubld_mask.vgrf(BRW_TYPE_UW);
unsigned mask_start = grf, mask_end = grf + mask_length;
ubld_mask.ADD(offset_reg, offset_base, brw_imm_uw((mask_start - grf_start) * 2));
/* Compare the 16B increments with the value coming from push
* constants and store the result into a dword. This expands a
* comparison between 2 values in 16B increments into a 32-bit mask
* where each bit covers 4 bits of data in the payload.
*
* This expansion works because of the sign extension guaranteed
* by the HW.
*
* SKL PRMs, Volume 7: 3D-Media-GPGPU, Execution Data Type:
*
* "The following rules explain the conversion of multiple
* source operand types, possibly a mix of different types, to
* one common execution type:
* - ...
* - Unsigned integers are converted to signed integers.
* - Byte (B) or Unsigned Byte (UB) values are converted to a Word
* or wider integer execution type.
* - If source operands have different integer widths, use
* the widest width specified to choose the signed integer
* execution type."
*/
brw_reg mask_reg = ubld_mask.vgrf(BRW_TYPE_UD);
ubld_mask.CMP(mask_reg, byte_offset(mask, i), offset_reg, BRW_CONDITIONAL_G);
for (unsigned and_length; grf < mask_end; grf += and_length) {
and_length = 1u << (util_last_bit(MIN2(grf_end - grf, max_grf_writes)) - 1);
if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, and_length)))
continue;
brw_reg push_reg = retype(brw_vec8_grf(grf, 0), BRW_TYPE_D);
/* Expand the masking bits one more time (1 bit -> 4 bits because
* UB -> UD) so that each 8 bits of mask now cover 32 bits of
* data, while doing the masking in the payload data.
*/
ubld.group(and_length * 8, 0).AND(
push_reg,
byte_offset(retype(mask_reg, BRW_TYPE_B),
(grf - mask_start) * 8),
push_reg);
}
} while (grf < grf_end);
}
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | BRW_DEPENDENCY_VARIABLES);
}
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
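
To make the two-step mask expansion described in the comments above concrete,
here is a rough CPU-side emulation of the idea (a sketch under assumed sizes,
not the GRF code; the HW performs the B -> D sign extension implicitly):

#include <stdint.h>

/* Sketch: step 1 builds one dword of all-ones/all-zeroes per 16B unit
 * of payload (kept iff limit > offset); step 2 reads those dwords back
 * as signed bytes and sign-extends each byte to mask one dword of
 * payload, so one 32-bit mask value covers 128 bits of payload data.
 * Assumes num_dw <= 256 (64 units).
 */
static void
apply_ubo_limit(uint32_t *payload_dw, unsigned num_dw, uint8_t limit)
{
   uint32_t unit_mask[64];
   const unsigned num_units = (num_dw + 3) / 4;

   for (unsigned u = 0; u < num_units; u++)
      unit_mask[u] = (limit > u) ? 0xFFFFFFFFu : 0u;

   const int8_t *mask_bytes = (const int8_t *)unit_mask;
   for (unsigned i = 0; i < num_dw; i++)
      payload_dw[i] &= (uint32_t)(int32_t)mask_bytes[i];
}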


@@ -2429,13 +2429,13 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_address bind_addr = anv_address_add(buffer->address, offset);
desc->bind_range = vk_buffer_range(&buffer->vk, offset, range);
/* We report a bounds checking alignment of ANV_UBO_ALIGNMENT in
/* We report a bounds checking alignment of ANV_UBO_BOUNDS_CHECK_ALIGNMENT in
* VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
* so align the range to that.
*/
if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT);
desc->bind_range = align64(desc->bind_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) {
struct anv_address_range_descriptor desc_data = {
@@ -3014,7 +3014,7 @@ void anv_GetDescriptorEXT(
* messages which read an entire register worth at a time.
*/
if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
range = align64(range, ANV_UBO_ALIGNMENT);
range = align64(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
isl_surf_usage_flags_t usage =
pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?


@@ -273,9 +273,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
}
const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
unsigned range_start_reg = push_constant_range.length;
for (int i = 0; i < 4; i++) {
for (unsigned i = 0; i < 4; i++) {
struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
if (ubo_range->length == 0)
continue;
@@ -300,11 +299,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
/* We only bother to shader-zero pushed client UBOs */
if (binding->set < MAX_SETS &&
(robust_flags & BRW_ROBUSTNESS_UBO)) {
prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
ubo_range->length);
prog_data->robust_ubo_ranges |= (uint8_t) (1 << i);
}
range_start_reg += ubo_range->length;
}
} else if (push_constant_range.length > 0) {
/* For Ivy Bridge, the push constants packets have a different


@@ -53,7 +53,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
assert(ANV_UBO_ALIGNMENT == 64);
unsigned suboffset = offset % 64;
uint64_t aligned_offset = offset - suboffset;
unsigned aligned_offset = offset - suboffset;
/* Load two just in case we go over a 64B boundary */
nir_def *data[2];
@@ -64,11 +64,30 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
b, 16, 32, addr,
.access = nir_intrinsic_access(load),
.align_mul = 64);
if (bound) {
data[i] = nir_bcsel(b,
nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
data[i],
nir_imm_int(b, 0));
}
if (bound) {
nir_def* offsets =
nir_imm_uvec8(b, aligned_offset, aligned_offset + 16,
aligned_offset + 32, aligned_offset + 48,
aligned_offset + 64, aligned_offset + 80,
aligned_offset + 96, aligned_offset + 112);
nir_def* mask =
nir_bcsel(b, nir_ilt(b, offsets, bound),
nir_imm_int(b, 0xFFFFFFFF),
nir_imm_int(b, 0x00000000));
for (unsigned i = 0; i < 2; i++) {
/* We prepared a mask where each bit covers 4 bits of the UBO block
* we've loaded. When we apply it, each byte of the mask is sign
* extended to a dword to get the final bitfield. This can be
* optimized because Intel HW allows instructions to mix several
* types and perform the sign extensions implicitly.
*/
data[i] =
nir_iand(b,
nir_i2iN(b, nir_extract_bits(b, &mask, 1, i * 128, 16, 8), 32),
data[i]);
}
}
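
The NIR emitted above is equivalent to the following scalar per-sub-block
check (a simplified sketch of the semantics, not the lowering itself):

#include <stdint.h>

/* Sketch: keep each 16-byte sub-block of a 64-byte aligned load only if
 * its start offset is below the bound reported for the UBO; otherwise
 * return zeroes, matching robust-access semantics.
 */
static void
mask_loaded_64b_block(uint32_t block_dw[16], unsigned aligned_offset,
                      unsigned bound)
{
   for (unsigned sub = 0; sub < 4; sub++) {
      if (aligned_offset + sub * 16 >= bound) {
         for (unsigned dw = 0; dw < 4; dw++)
            block_dw[sub * 4 + dw] = 0;
      }
   }
}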


@@ -1568,7 +1568,7 @@ get_properties(const struct anv_physical_device *pdevice,
props->robustStorageBufferAccessSizeAlignment =
ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
props->robustUniformBufferAccessSizeAlignment =
ANV_UBO_ALIGNMENT;
ANV_UBO_BOUNDS_CHECK_ALIGNMENT;
}
/* VK_KHR_vertex_attribute_divisor */


@@ -198,6 +198,7 @@ get_max_vbs(const struct intel_device_info *devinfo) {
* GEM object.
*/
#define ANV_UBO_ALIGNMENT 64
#define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 16
#define ANV_SSBO_ALIGNMENT 4
#define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4
#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16
@@ -3973,7 +3974,7 @@ struct anv_push_constants {
uint32_t tcs_input_vertices;
/** Robust access pushed registers. */
uint64_t push_reg_mask[MESA_SHADER_STAGES];
uint8_t push_reg_mask[MESA_SHADER_STAGES][4];
uint32_t fs_per_prim_remap_offset;
} gfx;


@@ -1957,7 +1957,7 @@ emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
* VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
*/
if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
range = align(range, ANV_UBO_ALIGNMENT);
range = align(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
struct anv_address address =
anv_address_add(desc->buffer->address, offset);


@@ -305,7 +305,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
/* Align the range for consistency */
bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
bound_range = align(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
return bound_range;
}
@@ -444,42 +444,45 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
continue;
const struct anv_shader_bin *shader = gfx->shaders[stage];
if (shader->prog_data->zero_push_reg) {
if (shader->prog_data->robust_ubo_ranges) {
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
struct anv_push_constants *push = &gfx->base.push_constants;
push->gfx.push_reg_mask[stage] = 0;
/* Start of the current range, relative to the start of the push
* constants in the shader.
*/
unsigned range_start_reg = 0;
unsigned ubo_range_index = 0;
for (unsigned i = 0; i < 4; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
if (range->length == 0)
continue;
/* Never clear this padding register as it might contain payload
* data.
*/
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
/* Skip any push ranges that were not promoted from UBOs */
if (range->set >= MAX_SETS)
continue;
assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index));
unsigned bound_size =
get_push_range_bound_size(cmd_buffer, shader, range);
if (bound_size >= range->start * 32) {
unsigned bound_regs =
MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
range->length);
assert(range_start_reg + bound_regs <= 64);
push->gfx.push_reg_mask[stage] |=
BITFIELD64_RANGE(range_start_reg, bound_regs);
uint8_t range_mask = 0;
/* Determine the bound length of the range in 16-byte units */
if (bound_size > range->start * 32) {
bound_size = MIN2(
DIV_ROUND_UP(bound_size - range->start * 32, 16),
2 * range->length);
range_mask = (uint8_t) bound_size;
assert(bound_size < 256);
}
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
gfx->base.push_constants_data_dirty = true;
/* Update the pushed bound length constant if it changed */
if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) {
push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask;
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
gfx->base.push_constants_data_dirty = true;
}
range_start_reg += range->length;
++ubo_range_index;
}
}
}
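
A worked example of the limit computation above, with assumed numbers (a
sketch, not the command-buffer code):

#include <stdint.h>

/* Sketch: a push range starting at register 2 (64B into the UBO) with a
 * pushed length of 4 registers (128B), while the descriptor only binds
 * 100B of the buffer.
 */
static uint8_t
example_range_limit(void)
{
   const unsigned range_start  = 2;    /* in 32B registers */
   const unsigned range_length = 4;    /* in 32B registers */
   const unsigned bound_size   = 100;  /* bytes bound by the descriptor */

   if (bound_size <= range_start * 32)
      return 0;                         /* nothing bound: zero the range */

   const unsigned units     = (bound_size - range_start * 32 + 15) / 16; /* 3 */
   const unsigned max_units = 2 * range_length;                          /* 8 */
   return (uint8_t)(units < max_units ? units : max_units);              /* 3 */
}

With a limit of 3, the shader keeps the first three 16-byte units (48 bytes)
of the pushed range and zeroes the rest.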