brw,anv: Reduce UBO robustness size alignment to 16 bytes
Instead of being encoded as a contiguous 64-bit mask of individual registers,
the robustness information is now encoded as a vector of up to 4 bytes that
represent the limits of each of the pushed UBO ranges in 16-byte units.

Some buggy Direct3D workloads are known to depend on a robustness alignment
as low as 16 bytes to work properly.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36455>
parent a3ecdf33a3 · commit c7e48f79b7
9 changed files with 149 additions and 71 deletions
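As context for the diff below, here is a minimal C sketch of the encoding change the commit message describes. The helpers are illustrative, not driver code: the old scheme pushed one 64-bit mask with a bit per 32-byte push register, the new one pushes one byte per UBO range holding the bound size in 16-byte units.

#include <stdint.h>

/* Old-style encoding: one bit per in-bounds 32-byte push register. */
uint64_t old_reg_mask(unsigned first_reg, unsigned in_bounds_regs)
{
   uint64_t bits = in_bounds_regs >= 64 ? ~0ull : ((1ull << in_bounds_regs) - 1);
   return bits << first_reg;
}

/* New-style encoding: one byte per UBO range, bound size in 16-byte units,
 * clamped to the pushed length (one 32-byte register = two 16-byte units). */
uint8_t new_range_limit(uint32_t bound_bytes, unsigned range_len_regs)
{
   uint32_t units = (bound_bytes + 15) / 16;
   uint32_t max_units = range_len_regs * 2;
   return (uint8_t)(units < max_units ? units : max_units);
}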
@@ -599,17 +599,16 @@ struct brw_stage_prog_data {
    mesa_shader_stage stage;

-   /* zero_push_reg is a bitfield which indicates what push registers (if any)
-    * should be zeroed by SW at the start of the shader. The corresponding
-    * push_reg_mask_param specifies the param index (in 32-bit units) where
-    * the actual runtime 64-bit mask will be pushed. The shader will zero
-    * push reg i if
+   /* If robust_ubo_ranges is not 0, push_reg_mask_param specifies the param
+    * index (in 32-bit units) where the 4 UBO range limits will be pushed
+    * as 8-bit integers. The shader will zero byte i of UBO range j if:
     *
-    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
+    *    (robust_ubo_ranges & (1 << j)) &&
+    *    (i < push_reg_mask_param[j] * 16)
     *
-    * If this field is set, brw_compiler::compact_params must be false.
+    * brw_compiler::compact_params must be false if robust_ubo_ranges is used.
     */
-   uint64_t zero_push_reg;
+   uint8_t robust_ubo_ranges;
    unsigned push_reg_mask_param;

    unsigned curb_read_length;
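For illustration, a scalar reading of the new comment as a hypothetical helper (not part of the compiler), assuming the pushed per-range limit counts the in-bounds size of the range in 16-byte units: bytes of a robust-checked range survive while they lie below the limit, and everything else reads back as zero, which is what the AND-masking emitted in assign_curb_setup below implements.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical scalar model: 'limits[j]' is the 8-bit bound of UBO range j in
 * 16-byte units (pushed at push_reg_mask_param) and 'robust_ubo_ranges' is
 * the bitfield from brw_stage_prog_data. */
bool ubo_byte_survives(uint8_t robust_ubo_ranges, const uint8_t limits[4],
                       unsigned range_j, unsigned byte_i)
{
   if (!(robust_ubo_ranges & (1u << range_j)))
      return true;                      /* range not robust-checked */
   return byte_i < (unsigned)limits[range_j] * 16;
}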
@@ -813,39 +813,99 @@ brw_shader::assign_curb_setup()
       }
    }

-   uint64_t want_zero = used & prog_data->zero_push_reg;
-   if (want_zero) {
+   if (prog_data->robust_ubo_ranges) {
      brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block());
+     /* At most we can write 2 GRFs (HW limit), the SIMD width matching the
+      * HW generation depends on the size of the physical register.
+      */
+     const unsigned max_grf_writes = 2 * reg_unit(devinfo);
+     assert(max_grf_writes <= 4);

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = prog_data->push_reg_mask_param;
-     struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
-                                        mask_param % 8);
+     brw_reg mask = retype(brw_vec1_grf(payload().num_regs + mask_param / 8,
+                                        mask_param % 8), BRW_TYPE_UB);

-     brw_reg b32;
-     for (unsigned i = 0; i < 64; i++) {
-        if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
-           brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
-           ubld.SHL(horiz_offset(shifted, 8),
-                    byte_offset(retype(mask, BRW_TYPE_W), i / 8),
-                    brw_imm_v(0x01234567));
-           ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
+     /* For each 16bit lane, generate an offset in unit of 16B */
+     brw_reg offset_base = ubld.vgrf(BRW_TYPE_UW, max_grf_writes);
+     ubld.MOV(offset_base, brw_imm_uv(0x76543210));
+     ubld.MOV(horiz_offset(offset_base, 8), brw_imm_uv(0xFEDCBA98));
+     if (max_grf_writes > 2)
+        ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16));

-           brw_builder ubld16 = ubld.group(16, 0);
-           b32 = ubld16.vgrf(BRW_TYPE_D);
-           ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
-        }
+     u_foreach_bit(i, prog_data->robust_ubo_ranges) {
+        struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];

-        if (want_zero & BITFIELD64_BIT(i)) {
-           assert(i < prog_data->curb_read_length);
-           struct brw_reg push_reg =
-              retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
+        unsigned range_start = ubo_push_start[i] / 8;
+        uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length);
+        if (!want_zero)
+           continue;

-           ubld.AND(push_reg, push_reg, component(b32, i % 16));
-        }
+        const unsigned grf_start = payload().num_regs + range_start;
+        const unsigned grf_end = grf_start + ubo_range->length;
+        const unsigned max_grf_mask = max_grf_writes * 4;
+        unsigned grf = grf_start;
+
+        do {
+           unsigned mask_length = MIN2(grf_end - grf, max_grf_mask);
+           unsigned simd_width_mask = 1 << util_last_bit(mask_length * 2 - 1);
+
+           if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, mask_length))) {
+              grf += max_grf_mask;
+              continue;
+           }
+
+           /* Prepare section of mask, at 1/4 size */
+           brw_builder ubld_mask = ubld.group(simd_width_mask, 0);
+           brw_reg offset_reg = ubld_mask.vgrf(BRW_TYPE_UW);
+           unsigned mask_start = grf, mask_end = grf + mask_length;
+           ubld_mask.ADD(offset_reg, offset_base, brw_imm_uw((mask_start - grf_start) * 2));
+           /* Compare the 16B increments with the value coming from push
+            * constants and store the result into a dword. This expands a
+            * comparison between 2 values in 16B increments into a 32bit mask
+            * where each bit covers 4bits of data in the payload.
+            *
+            * This expansion works because of the sign extension guaranteed
+            * by the HW.
+            *
+            * SKL PRMs, Volume 7: 3D-Media-GPGPU, Execution Data Type:
+            *
+            *    "The following rules explain the conversion of multiple
+            *     source operand types, possibly a mix of different types, to
+            *     one common execution type:
+            *       - ...
+            *       - Unsigned integers are converted to signed integers.
+            *       - Byte (B) or Unsigned Byte (UB) values are converted to a Word
+            *         or wider integer execution type.
+            *       - If source operands have different integer widths, use
+            *         the widest width specified to choose the signed integer
+            *         execution type."
+            */
+           brw_reg mask_reg = ubld_mask.vgrf(BRW_TYPE_UD);
+           ubld_mask.CMP(mask_reg, byte_offset(mask, i), offset_reg, BRW_CONDITIONAL_G);
+
+           for (unsigned and_length; grf < mask_end; grf += and_length) {
+              and_length = 1u << (util_last_bit(MIN2(grf_end - grf, max_grf_writes)) - 1);
+
+              if (!(want_zero & BITFIELD64_RANGE(grf - grf_start, and_length)))
+                 continue;
+
+              brw_reg push_reg = retype(brw_vec8_grf(grf, 0), BRW_TYPE_D);
+
+              /* Expand the masking bits one more time (1bit -> 4bit because
+               * UB -> UD) so that now each 8bits of mask cover 32bits of
+               * data to mask, while doing the masking in the payload data.
+               */
+              ubld.group(and_length * 8, 0).AND(
+                 push_reg,
+                 byte_offset(retype(mask_reg, BRW_TYPE_B),
+                             (grf - mask_start) * 8),
+                 push_reg);
+           }
+        } while (grf < grf_end);
      }

-     invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
+     invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS | BRW_DEPENDENCY_VARIABLES);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
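A scalar model of what the emitted SHL/CMP/AND sequence computes may help here. The C below is illustrative only, assuming the pushed per-range limit counts in-bounds 16-byte units: each 16-byte chunk of the pushed range is compared against the limit and kept or zeroed wholesale. The real code builds the mask with a CMP per up-to-2-GRF section and relies on byte-to-dword sign extension so each mask byte covers a full dword of payload when ANDed in.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar equivalent of the emitted SIMD sequence: zero every
 * dword of a pushed UBO range whose 16-byte chunk lies at or past the limit. */
void zero_out_of_bounds_dwords(uint32_t *range_dwords, size_t num_dwords,
                               uint8_t limit_in_16b_units)
{
   for (size_t dw = 0; dw < num_dwords; dw++) {
      unsigned chunk = dw / 4;                       /* 4 dwords per 16 bytes */
      uint32_t keep = chunk < limit_in_16b_units ? 0xFFFFFFFFu : 0u;
      range_dwords[dw] &= keep;
   }
}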
@@ -2429,13 +2429,13 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
    struct anv_address bind_addr = anv_address_add(buffer->address, offset);
    desc->bind_range = vk_buffer_range(&buffer->vk, offset, range);

-   /* We report a bounds checking alignment of ANV_UBO_ALIGNMENT in
+   /* We report a bounds checking alignment of ANV_UBO_BOUNDS_CHECK_ALIGNMENT in
    * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
    * so align the range to that.
    */
   if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
       type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
-      desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT);
+      desc->bind_range = align64(desc->bind_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);

   if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) {
      struct anv_address_range_descriptor desc_data = {
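The helper used above rounds the bound range up to a power-of-two alignment. A stand-in definition (the real helper lives in Mesa's util headers; the name below is illustrative) shows why 16 matches Direct3D-style robustness, whose constant buffers are addressed in 16-byte registers, better than 64:

#include <stdint.h>

/* Illustrative round-up-to-power-of-two-alignment, not Mesa's align64(). */
static inline uint64_t round_up_pow2(uint64_t value, uint64_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

/* A 20-byte UBO binding:
 *    round_up_pow2(20, 64) == 64   -> 64 bytes readable before this change
 *    round_up_pow2(20, 16) == 32   -> 32 bytes readable with 16-byte checks
 */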
@@ -3014,7 +3014,7 @@ void anv_GetDescriptorEXT(
       * messages which read an entire register worth at a time.
       */
      if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
-        range = align64(range, ANV_UBO_ALIGNMENT);
+        range = align64(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);

      isl_surf_usage_flags_t usage =
         pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?
@@ -273,9 +273,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
      }

      const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
-      unsigned range_start_reg = push_constant_range.length;

-      for (int i = 0; i < 4; i++) {
+      for (unsigned i = 0; i < 4; i++) {
         struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
         if (ubo_range->length == 0)
            continue;
@@ -300,11 +299,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
         /* We only bother to shader-zero pushed client UBOs */
         if (binding->set < MAX_SETS &&
             (robust_flags & BRW_ROBUSTNESS_UBO)) {
-           prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
-                                                        ubo_range->length);
+           prog_data->robust_ubo_ranges |= (uint8_t) (1 << i);
         }
-
-        range_start_reg += ubo_range->length;
      }
   } else if (push_constant_range.length > 0) {
      /* For Ivy Bridge, the push constants packets have a different
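A rough sketch of the new bookkeeping in this pass, using stand-in types rather than the anv/brw ones: instead of accumulating a 64-bit register mask per promoted UBO range, only the set of ranges that need robust handling is recorded, and the per-range limits are filled in later at flush time.

#include <stdint.h>

struct toy_push_range {
   unsigned length;          /* in 32-byte registers, 0 = unused   */
   int      is_client_ubo;   /* promoted from a client UBO binding */
};

/* Illustrative: build the robust_ubo_ranges bitfield for up to 4 ranges. */
uint8_t collect_robust_ubo_ranges(const struct toy_push_range ranges[4])
{
   uint8_t robust_ubo_ranges = 0;
   for (unsigned i = 0; i < 4; i++) {
      if (ranges[i].length == 0 || !ranges[i].is_client_ubo)
         continue;
      robust_ubo_ranges |= (uint8_t)(1u << i);
   }
   return robust_ubo_ranges;
}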
@@ -53,7 +53,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
   assert(ANV_UBO_ALIGNMENT == 64);

   unsigned suboffset = offset % 64;
-   uint64_t aligned_offset = offset - suboffset;
+   unsigned aligned_offset = offset - suboffset;

   /* Load two just in case we go over a 64B boundary */
   nir_def *data[2];
@@ -64,11 +64,30 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
         b, 16, 32, addr,
         .access = nir_intrinsic_access(load),
         .align_mul = 64);
-      if (bound) {
-         data[i] = nir_bcsel(b,
-                             nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
-                             data[i],
-                             nir_imm_int(b, 0));
-      }
   }

+   if (bound) {
+      nir_def* offsets =
+         nir_imm_uvec8(b, aligned_offset, aligned_offset + 16,
+                       aligned_offset + 32, aligned_offset + 48,
+                       aligned_offset + 64, aligned_offset + 80,
+                       aligned_offset + 96, aligned_offset + 112);
+      nir_def* mask =
+         nir_bcsel(b, nir_ilt(b, offsets, bound),
+                   nir_imm_int(b, 0xFFFFFFFF),
+                   nir_imm_int(b, 0x00000000));
+
+      for (unsigned i = 0; i < 2; i++) {
+         /* We prepared a mask where every 1 bit of mask covers 4 bits of the
+          * UBO block we've loaded, when we apply it we'll sign extend each
+          * byte of the mask to a dword to get the final bitfield, this can
+          * be optimized because Intel HW allows instructions to mix several
+          * types and perform the sign extensions implicitly.
+          */
+         data[i] =
+            nir_iand(b,
+                     nir_i2iN(b, nir_extract_bits(b, &mask, 1, i * 128, 16, 8), 32),
+                     data[i]);
+      }
+   }
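A scalar model of the new NIR-level masking may clarify the intent (illustrative C, not the lowering itself): the load fetches two aligned 64-byte blocks, and every dword whose 16-byte chunk starts at or past the runtime bound is forced to zero, which is what the vec8 of chunk offsets compared against bound expresses in SSA form.

#include <stdint.h>

void mask_loaded_blocks(uint32_t data[2][16],     /* two 64B blocks of dwords */
                        uint32_t aligned_offset,  /* 64B-aligned load offset  */
                        uint32_t bound)           /* runtime UBO bound, bytes */
{
   for (unsigned block = 0; block < 2; block++) {
      for (unsigned dw = 0; dw < 16; dw++) {
         uint32_t chunk_offset = aligned_offset + block * 64 + (dw / 4) * 16;
         if (chunk_offset >= bound)
            data[block][dw] = 0;   /* out-of-bounds chunk reads back as zero */
      }
   }
}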
@@ -1568,7 +1568,7 @@ get_properties(const struct anv_physical_device *pdevice,
      props->robustStorageBufferAccessSizeAlignment =
         ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
      props->robustUniformBufferAccessSizeAlignment =
-        ANV_UBO_ALIGNMENT;
+        ANV_UBO_BOUNDS_CHECK_ALIGNMENT;
   }

   /* VK_KHR_vertex_attribute_divisor */
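From the application side, the change is visible through the standard VK_EXT_robustness2 property query; on anv with this commit the uniform-buffer value drops from 64 to 16. The snippet assumes a valid VkPhysicalDevice obtained from an instance where the extension is available.

#include <stdio.h>
#include <vulkan/vulkan.h>

void print_robustness2_alignments(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceRobustness2PropertiesEXT robustness2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &robustness2,
   };
   vkGetPhysicalDeviceProperties2(physical_device, &props2);

   printf("robustUniformBufferAccessSizeAlignment: %llu\n",
          (unsigned long long)robustness2.robustUniformBufferAccessSizeAlignment);
   printf("robustStorageBufferAccessSizeAlignment: %llu\n",
          (unsigned long long)robustness2.robustStorageBufferAccessSizeAlignment);
}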
@@ -198,6 +198,7 @@ get_max_vbs(const struct intel_device_info *devinfo) {
 * GEM object.
 */
#define ANV_UBO_ALIGNMENT 64
+#define ANV_UBO_BOUNDS_CHECK_ALIGNMENT 16
#define ANV_SSBO_ALIGNMENT 4
#define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4
#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16
@@ -3973,7 +3974,7 @@ struct anv_push_constants {
      uint32_t tcs_input_vertices;

      /** Robust access pushed registers. */
-      uint64_t push_reg_mask[MESA_SHADER_STAGES];
+      uint8_t push_reg_mask[MESA_SHADER_STAGES][4];

      uint32_t fs_per_prim_remap_offset;
   } gfx;
@@ -1957,7 +1957,7 @@ emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
    * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
    */
   if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
-      range = align(range, ANV_UBO_ALIGNMENT);
+      range = align(range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);

   struct anv_address address =
      anv_address_add(desc->buffer->address, offset);
@@ -305,7 +305,7 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
   uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);

   /* Align the range for consistency */
-   bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
+   bound_range = align(bound_range, ANV_UBO_BOUNDS_CHECK_ALIGNMENT);

   return bound_range;
}
@@ -444,42 +444,45 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
         continue;

      const struct anv_shader_bin *shader = gfx->shaders[stage];
-      if (shader->prog_data->zero_push_reg) {
+      if (shader->prog_data->robust_ubo_ranges) {
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx->base.push_constants;

-        push->gfx.push_reg_mask[stage] = 0;
-        /* Start of the current range in the shader, relative to the start of
-         * push constants in the shader.
-         */
-        unsigned range_start_reg = 0;
+        unsigned ubo_range_index = 0;
         for (unsigned i = 0; i < 4; i++) {
            const struct anv_push_range *range = &bind_map->push_ranges[i];
            if (range->length == 0)
               continue;

-           /* Never clear this padding register as it might contain payload
-            * data.
-            */
-           if (range->set == ANV_DESCRIPTOR_SET_PER_PRIM_PADDING)
+           /* Skip any push ranges that were not promoted from UBOs */
+           if (range->set >= MAX_SETS)
              continue;

+           assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index));
+
           unsigned bound_size =
              get_push_range_bound_size(cmd_buffer, shader, range);
-          if (bound_size >= range->start * 32) {
-             unsigned bound_regs =
-                MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
-                     range->length);
-             assert(range_start_reg + bound_regs <= 64);
-             push->gfx.push_reg_mask[stage] |=
-                BITFIELD64_RANGE(range_start_reg, bound_regs);
-          }
+          uint8_t range_mask = 0;
+
+          /* Determine the bound length of the range in 16-byte units */
+          if (bound_size > range->start * 32) {
+             bound_size = MIN2(
+                DIV_ROUND_UP(bound_size - range->start * 32, 16),
+                2 * range->length);
+             range_mask = (uint8_t) bound_size;
+             assert(bound_size < 256);
+          }

-          cmd_buffer->state.push_constants_dirty |=
-             mesa_to_vk_shader_stage(stage);
-          gfx->base.push_constants_data_dirty = true;
+          /* Update the pushed bound length constant if it changed */
+          if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) {
+             push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask;
+             cmd_buffer->state.push_constants_dirty |=
+                mesa_to_vk_shader_stage(stage);
+             gfx->base.push_constants_data_dirty = true;
+          }

-          range_start_reg += range->length;
+          ++ubo_range_index;
        }
     }
  }
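A condensed model of the per-range limit computation done at flush time (helper and parameter names are illustrative, not the anv functions): the bound size of a push range is converted from bytes into 16-byte units relative to the start of the range and clamped to twice its length in 32-byte registers; 0 means the whole range is out of bounds and the shader zeroes it entirely.

#include <stdint.h>

static inline unsigned div_round_up(unsigned n, unsigned d) { return (n + d - 1) / d; }
static inline unsigned min2(unsigned a, unsigned b) { return a < b ? a : b; }

uint8_t compute_range_limit(unsigned bound_size_bytes,   /* from the bound check     */
                            unsigned range_start_regs,   /* range start, 32B regs    */
                            unsigned range_length_regs)  /* range length, 32B regs   */
{
   if (bound_size_bytes <= range_start_regs * 32)
      return 0;   /* whole range out of bounds */

   unsigned units = min2(div_round_up(bound_size_bytes - range_start_regs * 32, 16),
                         2 * range_length_regs);
   return (uint8_t)units;   /* well under 256; the diff asserts bound_size < 256 */
}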