brw,anv: Reduce UBO robustness size alignment to 16 bytes

Instead of being encoded as a contiguous 64-bit mask of individual registers,
the robustness information is now encoded as a vector of up to 4 bytes, each
representing the limit of one of the pushed UBO ranges in 16-byte units.
Some buggy Direct3D workloads are known to depend on a robustness size
alignment as low as 16 bytes to work properly.
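
For reference, a minimal sketch of what the change means for the pushed
robustness data per shader stage (field types follow the anv_push_constants
hunk below; the struct and macro names here are made up for illustration):

#include <stdint.h>

#define STAGES 6 /* placeholder stage count, just for this sketch */

struct robustness_push_before {
   /* One bit per pushed 32B register that may need zeroing at runtime. */
   uint64_t push_reg_mask[STAGES];
};

struct robustness_push_after {
   /* One byte per pushed UBO range (up to 4), holding the bound size of
    * that range in 16-byte units.
    */
   uint8_t push_reg_mask[STAGES][4];
};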

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36455>
Calder Young 2025-07-29 14:44:33 -07:00 committed by Marge Bot
parent a3ecdf33a3
commit c7e48f79b7
9 changed files with 149 additions and 71 deletions


@@ -599,17 +599,16 @@ struct brw_stage_prog_data {
mesa_shader_stage stage;
/* zero_push_reg is a bitfield which indicates what push registers (if any)
* should be zeroed by SW at the start of the shader. The corresponding
* push_reg_mask_param specifies the param index (in 32-bit units) where
* the actual runtime 64-bit mask will be pushed. The shader will zero
* push reg i if
/* If robust_ubo_ranges is not 0, push_reg_mask_param specifies the param
* index (in 32-bit units) where the 4 UBO range limits will be pushed
* as 8-bit integers. The shader will zero byte i of UBO range j if:
*
* reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
* (robust_ubo_ranges & (1 << j)) &&
(i >= push_reg_mask_param[j] * 16)
*
* If this field is set, brw_compiler::compact_params must be false.
* brw_compiler::compact_params must be false if robust_ubo_ranges is used.
*/
uint64_t zero_push_reg;
uint8_t robust_ubo_ranges;
unsigned push_reg_mask_param;
unsigned curb_read_length;
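
A stand-alone sketch of the zeroing rule stated in the comment above (the
helper and its parameters are illustrative, not part of the compiler):

#include <stdbool.h>
#include <stdint.h>

/* Sketch: would byte i of pushed UBO range j be zeroed by the shader,
 * given the four 8-bit limits (in 16-byte units) pushed at
 * push_reg_mask_param?
 */
static bool
ubo_byte_is_zeroed(uint8_t robust_ubo_ranges, const uint8_t limits[4],
                   unsigned j, unsigned i)
{
   if (!(robust_ubo_ranges & (1u << j)))
      return false;                      /* range j is not bounds-checked */
   return i >= (unsigned)limits[j] * 16; /* byte lies past the bound size */
}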


@@ -813,39 +813,99 @@ brw_shader::assign_curb_setup()
}
}
uint64_t want_zero = used & prog_data->zero_push_reg;
if (want_zero) {
if (prog_data->robust_ubo_ranges) {
brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block());
/* At most we can write 2 GRFs (HW limit); the matching SIMD width
* depends on the size of the physical register for the HW generation.
*/
const unsigned max_grf_writes = 2 * reg_unit(devinfo);
assert(max_grf_writes <= 4);
/* push_reg_mask_param is in 32-bit units */
unsigned mask_param = prog_data->push_reg_mask_param;
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
mask_param % 8);
brw_reg mask = retype(brw_vec1_grf(payload().num_regs + mask_param / 8,
mask_param % 8), BRW_TYPE_UB);
brw_reg b32;
for (unsigned i = 0; i < 64; i++) {
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
ubld.SHL(horiz_offset(shifted, 8),
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
brw_imm_v(0x01234567));
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
/* For each 16-bit lane, generate an offset in units of 16B */
brw_reg offset_base = ubld.vgrf(BRW_TYPE_UW, max_grf_writes);
ubld.MOV(offset_base, brw_imm_uv(0x76543210));
ubld.MOV(horiz_offset(offset_base, 8), brw_imm_uv(0xFEDCBA98));
if (max_grf_writes > 2)
ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16));
brw_builder ubld16 = ubld.group(16, 0);
b32 = ubld16.vgrf(BRW_TYPE_D);
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
}
u_foreach_bit(i, prog_data->robust_ubo_ranges) {
struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
if (want_zero & BITFIELD64_BIT(i)) {
assert(i < prog_data->curb_read_length);
struct brw_reg push_reg =
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
unsigned range_start = ubo_push_start[i] / 8;
uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length);
if (!want_zero)
continue;
ubld.AND(push_reg, push_reg, component(b32, i % 16));
}
const unsigned grf_start = payload().num_regs + range_start;
const unsigned grf_end = grf_start + ubo_range->length;
const unsigned max_grf_mask = max_grf_writes * 4;
unsigned grf = grf_start;
do {
unsigned mask_length = MIN2(grf_end - grf, max_grf_mask);
unsigned simd_width_mask = 1 << util_last_bit(mask_length * 2 - 1);
if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, mask_length))) {
grf += max_grf_mask;
continue;
}
/* Prepare section of mask, at 1/4 size */
brw_builder ubld_mask = ubld.group(simd_width_mask, 0);
brw_reg offset_reg = ubld_mask.vgrf(BRW_TYPE_UW);
unsigned mask_start = grf, mask_end = grf + mask_length;
ubld_mask.ADD(offset_reg, offset_base, brw_imm_uw((mask_start - grf_start) * 2));
/* Compare the 16B increments with the value coming from push
* constants and store the result into a dword. This expands a
* comparison between 2 values in 16B increments into a 32-bit mask
* where each bit covers 4 bits of data in the payload.
*
* This expansion works because of the sign extension guaranteed
* by the HW.
*
* SKL PRMs, Volume 7: 3D-Media-GPGPU, Execution Data Type:
*
* "The following rules explain the conversion of multiple
* source operand types, possibly a mix of different types, to
* one common execution type:
* - ...
* - Unsigned integers are converted to signed integers.
* - Byte (B) or Unsigned Byte (UB) values are converted to a Word
* or wider integer execution type.
* - If source operands have different integer widths, use
* the widest width specified to choose the signed integer
* execution type."
*/
brw_reg mask_reg = ubld_mask.vgrf(BRW_TYPE_UD);
ubld_mask.CMP(mask_reg, byte_offset(mask, i), offset_reg, BRW_CONDITIONAL_G);
for (unsigned and_length; grf < mask_end; grf += and_length) {
and_length = 1u << (util_last_bit(MIN2(grf_end - grf, max_grf_writes)) - 1);
if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, and_length)))
continue;
brw_reg push_reg = retype(brw_vec8_grf(grf, 0), BRW_TYPE_D);
/* Expand the masking bits one more time (1 bit -> 4 bits because
* UB -> UD) so that each 8 bits of mask now cover 32 bits of
* data, while doing the masking in the payload data.
*/
ubld.group(and_length * 8, 0).AND(
push_reg,
byte_offset(retype(mask_reg, BRW_TYPE_B),
(grf - mask_start) * 8),
push_reg);
}
} while (grf < grf_end);
}
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | BRW_DEPENDENCY_VARIABLES);
}
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
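
To make the two-step mask expansion described in the comments above concrete,
here is a rough CPU-side emulation of the idea (a sketch under assumed sizes,
not the GRF code; the HW performs the B -> D sign extension implicitly):

#include <stdint.h>

/* Sketch: step 1 builds one dword of all-ones/all-zeroes per 16B unit
 * of payload (kept iff limit > offset); step 2 reads those dwords back
 * as signed bytes and sign-extends each byte to mask one dword of
 * payload, so one 32-bit mask value covers 128 bits of payload data.
 * Assumes num_dw <= 256 (64 units).
 */
static void
apply_ubo_limit(uint32_t *payload_dw, unsigned num_dw, uint8_t limit)
{
   uint32_t unit_mask[64];
   const unsigned num_units = (num_dw + 3) / 4;

   for (unsigned u = 0; u < num_units; u++)
      unit_mask[u] = (limit > u) ? 0xFFFFFFFFu : 0u;

   const int8_t *mask_bytes = (const int8_t *)unit_mask;
   for (unsigned i = 0; i < num_dw; i++)
      payload_dw[i] &= (uint32_t)(int32_t)mask_bytes[i];
}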


@@ -2429,13 +2429,13 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_address bind_addr = anv_address_add(buffer->address, offset);
desc->bind_range = vk_buffer_range(&buffer->vk, offset, range);
/* We report a bounds checking alignment of ANV_UBO_ALIGNMENT in
/* We report a bounds checking alignment of ANV_UBO_BOUNDS_CHECK_ALIGNMENT in
* VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
* so align the range to that.
*/
if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT);
desc->bind_range = align64(desc->bind_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) {
struct anv_address_range_descriptor desc_data = {
@@ -3014,7 +3014,7 @@ void anv_GetDescriptorEXT(
* messages which read an entire register worth at a time.
*/
if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
range = align64(range, ANV_UBO_ALIGNMENT);
range = align64(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
isl_surf_usage_flags_t usage =
pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?


@@ -273,9 +273,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
}
const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
unsigned range_start_reg = push_constant_range.length;
for (int i = 0; i < 4; i++) {
for (unsigned i = 0; i < 4; i++) {
struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
if (ubo_range->length == 0)
continue;
@@ -300,11 +299,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
/* We only bother to shader-zero pushed client UBOs */
if (binding->set < MAX_SETS &&
(robust_flags & BRW_ROBUSTNESS_UBO)) {
prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
ubo_range->length);
prog_data->robust_ubo_ranges |= (uint8_t) (1 << i);
}
range_start_reg += ubo_range->length;
}
} else if (push_constant_range.length > 0) {
/* For Ivy Bridge, the push constants packets have a different


@@ -53,7 +53,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
assert(ANV_UBO_ALIGNMENT == 64);
unsigned suboffset = offset % 64;
uint64_t aligned_offset = offset - suboffset;
unsigned aligned_offset = offset - suboffset;
/* Load two just in case we go over a 64B boundary */
nir_def *data[2];
@@ -64,11 +64,30 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
b, 16, 32, addr,
.access = nir_intrinsic_access(load),
.align_mul = 64);
if (bound) {
data[i] = nir_bcsel(b,
nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
data[i],
nir_imm_int(b, 0));
}
if (bound) {
nir_def* offsets =
nir_imm_uvec8(b, aligned_offset, aligned_offset + 16,
aligned_offset + 32, aligned_offset + 48,
aligned_offset + 64, aligned_offset + 80,
aligned_offset + 96, aligned_offset + 112);
nir_def* mask =
nir_bcsel(b, nir_ilt(b, offsets, bound),
nir_imm_int(b, 0xFFFFFFFF),
nir_imm_int(b, 0x00000000));
for (unsigned i = 0; i < 2; i++) {
/* We prepared a mask where each bit covers 4 bits of the UBO block
* we've loaded. When we apply it, each byte of the mask is sign
* extended to a dword to get the final bitfield. This can be
* optimized because Intel HW allows instructions to mix several
* types and perform the sign extensions implicitly.
*/
data[i] =
nir_iand(b,
nir_i2iN(b, nir_extract_bits(b, &mask, 1, i * 128, 16, 8), 32),
data[i]);
}
}
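
The NIR emitted above is equivalent to the following scalar per-sub-block
check (a simplified sketch of the semantics, not the lowering itself):

#include <stdint.h>

/* Sketch: keep each 16-byte sub-block of a 64-byte aligned load only if
 * its start offset is below the bound reported for the UBO; otherwise
 * return zeroes, matching robust-access semantics.
 */
static void
mask_loaded_64b_block(uint32_t block_dw[16], unsigned aligned_offset,
                      unsigned bound)
{
   for (unsigned sub = 0; sub < 4; sub++) {
      if (aligned_offset + sub * 16 >= bound) {
         for (unsigned dw = 0; dw < 4; dw++)
            block_dw[sub * 4 + dw] = 0;
      }
   }
}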


@@ -1568,7 +1568,7 @@ get_properties(const struct anv_physical_device *pdevice,
props->robustStorageBufferAccessSizeAlignment =
ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
props->robustUniformBufferAccessSizeAlignment =
ANV_UBO_ALIGNMENT;
ANV_UBO_BOUNDS_CHECK_ALIGNMENT;
}
/* VK_KHR_vertex_attribute_divisor */


@@ -198,6 +198,7 @@ get_max_vbs(const struct intel_device_info *devinfo) {
* GEM object.
*/
#define ANV_UBO_ALIGNMENT 64
#define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 16
#define ANV_SSBO_ALIGNMENT 4
#define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4
#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16
@@ -3973,7 +3974,7 @@ struct anv_push_constants {
uint32_t tcs_input_vertices;
/** Robust access pushed registers. */
uint64_t push_reg_mask[MESA_SHADER_STAGES];
uint8_t push_reg_mask[MESA_SHADER_STAGES][4];
uint32_t fs_per_prim_remap_offset;
} gfx;


@@ -1957,7 +1957,7 @@ emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
* VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
*/
if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
range = align(range, ANV_UBO_ALIGNMENT);
range = align(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
struct anv_address address =
anv_address_add(desc->buffer->address, offset);


@@ -305,7 +305,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
/* Align the range for consistency */
bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
bound_range = align(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);
return bound_range;
}
@@ -444,42 +444,45 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
continue;
const struct anv_shader_bin *shader = gfx->shaders[stage];
if (shader->prog_data->zero_push_reg) {
if (shader->prog_data->robust_ubo_ranges) {
const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
struct anv_push_constants *push = &gfx->base.push_constants;
push->gfx.push_reg_mask[stage] = 0;
/* Start of the current range, relative to the start of the push
* constants in the shader.
*/
unsigned range_start_reg = 0;
unsigned ubo_range_index = 0;
for (unsigned i = 0; i < 4; i++) {
const struct anv_push_range *range = &bind_map->push_ranges[i];
if (range->length == 0)
continue;
/* Never clear this padding register as it might contain payload
* data.
*/
if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
/* Skip any push ranges that were not promoted from UBOs */
if (range->set >= MAX_SETS)
continue;
assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index));
unsigned bound_size =
get_push_range_bound_size(cmd_buffer, shader, range);
if (bound_size >= range->start * 32) {
unsigned bound_regs =
MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
range->length);
assert(range_start_reg + bound_regs <= 64);
push->gfx.push_reg_mask[stage] |=
BITFIELD64_RANGE(range_start_reg, bound_regs);
uint8_t range_mask = 0;
/* Determine the bound length of the range in 16-byte units */
if (bound_size > range->start * 32) {
bound_size = MIN2(
DIV_ROUND_UP(bound_size - range->start * 32, 16),
2 * range->length);
range_mask = (uint8_t) bound_size;
assert(bound_size < 256);
}
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
gfx->base.push_constants_data_dirty = true;
/* Update the pushed bound length constant if it changed */
if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) {
push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask;
cmd_buffer->state.push_constants_dirty |=
mesa_to_vk_shader_stage(stage);
gfx->base.push_constants_data_dirty = true;
}
range_start_reg += range->length;
++ubo_range_index;
}
}
}
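
A worked example of the limit computation above, with assumed numbers (a
sketch, not the command-buffer code):

#include <stdint.h>

/* Sketch: a push range starting at register 2 (64B into the UBO) with a
 * pushed length of 4 registers (128B), while the descriptor only binds
 * 100B of the buffer.
 */
static uint8_t
example_range_limit(void)
{
   const unsigned range_start  = 2;    /* in 32B registers */
   const unsigned range_length = 4;    /* in 32B registers */
   const unsigned bound_size   = 100;  /* bytes bound by the descriptor */

   if (bound_size <= range_start * 32)
      return 0;                         /* nothing bound: zero the range */

   const unsigned units     = (bound_size - range_start * 32 + 15) / 16; /* 3 */
   const unsigned max_units = 2 * range_length;                          /* 8 */
   return (uint8_t)(units < max_units ? units : max_units);              /* 3 */
}

With a limit of 3, the shader keeps the first three 16-byte units (48 bytes)
of the pushed range and zeroes the rest.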