From c7e48f79b7be5c0ec34d9267a32cf8cba08e2248 Mon Sep 17 00:00:00 2001 From: Calder Young Date: Tue, 29 Jul 2025 14:44:33 -0700 Subject: [PATCH] brw,anv: Reduce UBO robustness size alignment to 16 bytes Instead of being encoded as a contiguous 64-bit mask of individual registers, the robustness information is now encoded as a vector of up to 4 bytes that represent the limits of each of the pushed UBO ranges in 16 byte units. Some buggy Direct3D workloads are known to depend on a robustness alignment as low as 16 bytes to work properly. Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_compiler.h | 15 ++- src/intel/compiler/brw_shader.cpp | 106 ++++++++++++++---- src/intel/vulkan/anv_descriptor_set.c | 6 +- .../vulkan/anv_nir_compute_push_layout.c | 8 +- src/intel/vulkan/anv_nir_lower_ubo_loads.c | 31 ++++- src/intel/vulkan/anv_physical_device.c | 2 +- src/intel/vulkan/anv_private.h | 3 +- src/intel/vulkan/genX_cmd_buffer.c | 2 +- src/intel/vulkan/genX_cmd_draw.c | 47 ++++---- 9 files changed, 149 insertions(+), 71 deletions(-) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 50577d51d5d..1e26e9f4bde 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -599,17 +599,16 @@ struct brw_stage_prog_data { mesa_shader_stage stage; - /* zero_push_reg is a bitfield which indicates what push registers (if any) - * should be zeroed by SW at the start of the shader. The corresponding - * push_reg_mask_param specifies the param index (in 32-bit units) where - * the actual runtime 64-bit mask will be pushed. The shader will zero - * push reg i if + /* If robust_ubo_ranges not 0, push_reg_mask_param specifies the param + * index (in 32-bit units) where the 4 UBO range limits will be pushed + * as 8-bit integers. 
The shader will zero byte i of UBO range j if: * - * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i) + * (robust_ubo_ranges & (1 << j)) && + * (i >= push_reg_mask_param[j] * 16) - * If this field is set, brw_compiler::compact_params must be false. + * brw_compiler::compact_params must be false if robust_ubo_ranges is used */ - uint64_t zero_push_reg; + uint8_t robust_ubo_ranges; unsigned push_reg_mask_param; unsigned curb_read_length; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 597f90f811b..b33fa4a493a 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -813,39 +813,99 @@ brw_shader::assign_curb_setup() } } - uint64_t want_zero = used & prog_data->zero_push_reg; - if (want_zero) { + if (prog_data->robust_ubo_ranges) { brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block()); + /* At most we can write 2 GRFs (HW limit), the SIMD width matching the + * HW generation depends on the size of the physical register. 
+ */ + const unsigned max_grf_writes = 2 * reg_unit(devinfo); + assert(max_grf_writes <= 4); /* push_reg_mask_param is in 32-bit units */ unsigned mask_param = prog_data->push_reg_mask_param; - struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8, - mask_param % 8); + brw_reg mask = retype(brw_vec1_grf(payload().num_regs + mask_param / 8, + mask_param % 8), BRW_TYPE_UB); - brw_reg b32; - for (unsigned i = 0; i < 64; i++) { - if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { - brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2); - ubld.SHL(horiz_offset(shifted, 8), - byte_offset(retype(mask, BRW_TYPE_W), i / 8), - brw_imm_v(0x01234567)); - ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8)); + /* For each 16bit lane, generate an offset in unit of 16B */ + brw_reg offset_base = ubld.vgrf(BRW_TYPE_UW, max_grf_writes); + ubld.MOV(offset_base, brw_imm_uv(0x76543210)); + ubld.MOV(horiz_offset(offset_base, 8), brw_imm_uv(0xFEDCBA98)); + if (max_grf_writes > 2) + ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16)); - brw_builder ubld16 = ubld.group(16, 0); - b32 = ubld16.vgrf(BRW_TYPE_D); - ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15)); - } + u_foreach_bit(i, prog_data->robust_ubo_ranges) { + struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; - if (want_zero & BITFIELD64_BIT(i)) { - assert(i < prog_data->curb_read_length); - struct brw_reg push_reg = - retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D); + unsigned range_start = ubo_push_start[i] / 8; + uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length); + if (!want_zero) + continue; - ubld.AND(push_reg, push_reg, component(b32, i % 16)); - } + const unsigned grf_start = payload().num_regs + range_start; + const unsigned grf_end = grf_start + ubo_range->length; + const unsigned max_grf_mask = max_grf_writes * 4; + unsigned grf = grf_start; + + do { + unsigned mask_length = MIN2(grf_end - grf, max_grf_mask); + 
unsigned simd_width_mask = 1 << util_last_bit(mask_length * 2 - 1); + + if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, mask_length))) { + grf += max_grf_mask; + continue; + } + + /* Prepare section of mask, at 1/4 size */ + brw_builder ubld_mask = ubld.group(simd_width_mask, 0); + brw_reg offset_reg = ubld_mask.vgrf(BRW_TYPE_UW); + unsigned mask_start = grf, mask_end = grf + mask_length; + ubld_mask.ADD(offset_reg, offset_base, brw_imm_uw((mask_start - grf_start) * 2)); + /* Compare the 16B increments with the value coming from push + * constants and store the result into a dword. This expands a + * comparison between 2 values in 16B increments into a 32bit mask + * where each bit covers 4bits of data in the payload. + * + * This expansion works because of the sign extension guaranteed + * by the HW. + * + * SKL PRMs, Volume 7: 3D-Media-GPGPU, Execution Data Type: + * + * "The following rules explain the conversion of multiple + * source operand types, possibly a mix of different types, to + * one common execution type: + * - ... + * - Unsigned integers are converted to signed integers. + * - Byte (B) or Unsigned Byte (UB) values are converted to a Word + * or wider integer execution type. + * - If source operands have different integer widths, use + * the widest width specified to choose the signed integer + * execution type." + */ + brw_reg mask_reg = ubld_mask.vgrf(BRW_TYPE_UD); + ubld_mask.CMP(mask_reg, byte_offset(mask, i), offset_reg, BRW_CONDITIONAL_G); + + for (unsigned and_length; grf < mask_end; grf += and_length) { + and_length = 1u << (util_last_bit(MIN2(grf_end - grf, max_grf_writes)) - 1); + + if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, and_length))) + continue; + + brw_reg push_reg = retype(brw_vec8_grf(grf, 0), BRW_TYPE_D); + + /* Expand the masking bits one more time (1bit -> 4bit because + * UB -> UD) so that now each 8bits of mask cover 32bits of + * data to mask, while doing the masking in the payload data. 
+ */ + ubld.group(and_length * 8, 0).AND( + push_reg, + byte_offset(retype(mask_reg, BRW_TYPE_B), + (grf - mask_start) * 8), + push_reg); + } + } while (grf < grf_end); } - invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); + invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | BRW_DEPENDENCY_VARIABLES); } /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */ diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index 5b8e5df711f..4f456422fd2 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -2429,13 +2429,13 @@ anv_descriptor_set_write_buffer(struct anv_device *device, struct anv_address bind_addr = anv_address_add(buffer->address, offset); desc->bind_range = vk_buffer_range(&buffer->vk, offset, range); - /* We report a bounds checking alignment of ANV_UBO_ALIGNMENT in + /* We report a bounds checking alignment of ANV_UBO_BOUNDS_CHECK_ALIGNMENT in * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment * so align the range to that. */ if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) - desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT); + desc->bind_range = align64(desc->bind_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) { struct anv_address_range_descriptor desc_data = { @@ -3014,7 +3014,7 @@ void anv_GetDescriptorEXT( * messages which read an entire register worth at a time. */ if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) - range = align64(range, ANV_UBO_ALIGNMENT); + range = align64(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); isl_surf_usage_flags_t usage = pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ? 
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 3e246dba13a..773a8a06fc0 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -273,9 +273,8 @@ anv_nir_compute_push_layout(nir_shader *nir, } const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4; - unsigned range_start_reg = push_constant_range.length; - for (int i = 0; i < 4; i++) { + for (unsigned i = 0; i < 4; i++) { struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; if (ubo_range->length == 0) continue; @@ -300,11 +299,8 @@ anv_nir_compute_push_layout(nir_shader *nir, /* We only bother to shader-zero pushed client UBOs */ if (binding->set < MAX_SETS && (robust_flags & BRW_ROBUSTNESS_UBO)) { - prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg, - ubo_range->length); + prog_data->robust_ubo_ranges |= (uint8_t) (1 << i); } - - range_start_reg += ubo_range->length; } } else if (push_constant_range.length > 0) { /* For Ivy Bridge, the push constants packets have a different diff --git a/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/src/intel/vulkan/anv_nir_lower_ubo_loads.c index 49e6f15a719..6406701c924 100644 --- a/src/intel/vulkan/anv_nir_lower_ubo_loads.c +++ b/src/intel/vulkan/anv_nir_lower_ubo_loads.c @@ -53,7 +53,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load, assert(ANV_UBO_ALIGNMENT == 64); unsigned suboffset = offset % 64; - uint64_t aligned_offset = offset - suboffset; + unsigned aligned_offset = offset - suboffset; /* Load two just in case we go over a 64B boundary */ nir_def *data[2]; @@ -64,11 +64,30 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load, b, 16, 32, addr, .access = nir_intrinsic_access(load), .align_mul = 64); - if (bound) { - data[i] = nir_bcsel(b, - nir_igt_imm(b, bound, aligned_offset + i * 64 + 63), - data[i], - nir_imm_int(b, 0)); + } + + if (bound) { + nir_def* offsets = + 
nir_imm_uvec8(b, aligned_offset, aligned_offset + 16, + aligned_offset + 32, aligned_offset + 48, + aligned_offset + 64, aligned_offset + 80, + aligned_offset + 96, aligned_offset + 112); + nir_def* mask = + nir_bcsel(b, nir_ilt(b, offsets, bound), + nir_imm_int(b, 0xFFFFFFFF), + nir_imm_int(b, 0x00000000)); + + for (unsigned i = 0; i < 2; i++) { + /* We prepared a mask where every 1 bit of mask covers 4 bits of the + * UBO block we've loaded, when we apply it we'll sign extend each + * byte of the mask to a dword to get the final bitfield, this can + * be optimized because Intel HW allows instructions to mix several + * types and perform the sign extensions implicitly. + */ + data[i] = + nir_iand(b, + nir_i2iN(b, nir_extract_bits(b, &mask, 1, i * 128, 16, 8), 32), + data[i]); } } diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 310c48672a4..98a67180890 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -1568,7 +1568,7 @@ get_properties(const struct anv_physical_device *pdevice, props->robustStorageBufferAccessSizeAlignment = ANV_SSBO_BOUNDS_CHECK_ALIGNMENT; props->robustUniformBufferAccessSizeAlignment = - ANV_UBO_ALIGNMENT; + ANV_UBO_BOUNDS_CHECK_ALIGNMENT; } /* VK_KHR_vertex_attribute_divisor */ diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index f0322888588..05d2dc6e0cb 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -198,6 +198,7 @@ get_max_vbs(const struct intel_device_info *devinfo) { * GEM object. */ #define ANV_UBO_ALIGNMENT 64 +#define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 16 #define ANV_SSBO_ALIGNMENT 4 #define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4 #define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16 @@ -3973,7 +3974,7 @@ struct anv_push_constants { uint32_t tcs_input_vertices; /** Robust access pushed registers. 
*/ - uint64_t push_reg_mask[MESA_SHADER_STAGES]; + uint8_t push_reg_mask[MESA_SHADER_STAGES][4]; uint32_t fs_per_prim_remap_offset; } gfx; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 83331827fd7..2ef239d7209 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1957,7 +1957,7 @@ emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer, * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment */ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) - range = align(range, ANV_UBO_ALIGNMENT); + range = align(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); struct anv_address address = anv_address_add(desc->buffer->address, offset); diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index 7da50785a29..f7f61b38471 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -305,7 +305,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer, uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset); /* Align the range for consistency */ - bound_range = align(bound_range, ANV_UBO_ALIGNMENT); + bound_range = align(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT); return bound_range; } @@ -444,42 +444,45 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer, continue; const struct anv_shader_bin *shader = gfx->shaders[stage]; - if (shader->prog_data->zero_push_reg) { + if (shader->prog_data->robust_ubo_ranges) { const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; struct anv_push_constants *push = &gfx->base.push_constants; - push->gfx.push_reg_mask[stage] = 0; - /* Start of the current range in the shader, relative to the start of - * push constants in the shader. 
- */ - unsigned range_start_reg = 0; + unsigned ubo_range_index = 0; for (unsigned i = 0; i < 4; i++) { const struct anv_push_range *range = &bind_map->push_ranges[i]; if (range->length == 0) continue; - /* Never clear this padding register as it might contain payload - * data. - */ - if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING) + /* Skip any push ranges that were not promoted from UBOs */ + if (range->set >= MAX_SETS) continue; + assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index)); + unsigned bound_size = get_push_range_bound_size(cmd_buffer, shader, range); - if (bound_size >= range->start * 32) { - unsigned bound_regs = - MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, - range->length); - assert(range_start_reg + bound_regs <= 64); - push->gfx.push_reg_mask[stage] |= - BITFIELD64_RANGE(range_start_reg, bound_regs); + + uint8_t range_mask = 0; + + /* Determine the bound length of the range in 16-byte units */ + if (bound_size > range->start * 32) { + bound_size = MIN2( + DIV_ROUND_UP(bound_size - range->start * 32, 16), + 2 * range->length); + range_mask = (uint8_t) bound_size; + assert(bound_size < 256); } - cmd_buffer->state.push_constants_dirty |= - mesa_to_vk_shader_stage(stage); - gfx->base.push_constants_data_dirty = true; + /* Update the pushed bound length constant if it changed */ + if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) { + push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask; + cmd_buffer->state.push_constants_dirty |= + mesa_to_vk_shader_stage(stage); + gfx->base.push_constants_data_dirty = true; + } - range_start_reg += range->length; + ++ubo_range_index; } } }