brw: Extend load_urb/store_urb to handle 32-bit non-vec4-aligned access

(Based on the original implementation by Lionel Landwerlin, but adapted
to my respun URB lowering framework.)

The mesh shader URB payload requires reading and writing fields at
arbitrary DWord offsets.  For example, the Primitive Indices array
starts at DWord 1, and it can be a vec1[], vec2[], or vec3[] array,
leading to elements that are often unaligned and sometimes
"double-parked" (straddling two vec4 slots).

Still, most fields are conveniently vec4-aligned.

To handle this, we add a new cb_data::vec4_access flag.  If set, access
remains in vec4 units, with vec4 alignment.  We use this for non-mesh
stages.  When unset, offset is in 32-bit units, allowing unaligned
DWord access.
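
(Illustration only, not part of the patch: in byte terms the two modes
work out as below.  The helper name is made up; the pass itself builds
the equivalent NIR with nir_ishl_imm.)

    static unsigned
    urb_offset_to_bytes(bool vec4_access, unsigned offset)
    {
       return vec4_access ? offset * 16   /* 128-bit vec4 slots */
                          : offset * 4;   /* 32-bit DWords      */
    }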

This is trivial to support on Xe2, where the LSC URB messages allow
arbitrary byte-aligned addressing.  On older platforms, we have to
convert this to a vec4-aligned offset plus a component offset (either
returning a subset of the loaded channels, or using component masking
to store a subset of a vec4/vec8).
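
(Sketch of that split, assuming a hypothetical dword_offset variable;
the lowering emits the equivalent NIR with nir_ishr_imm/nir_iand_imm.)

    unsigned vec4_offset = dword_offset >> 2;   /* vec4 slot to address   */
    unsigned component   = dword_offset & 0x3;  /* DWord within that slot */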

Thankfully, the OWord URB messages can access a vec8 at a time, so we
can do any vec4 access in a single message, even if it's double-parked.
We use mod-analysis to see whether the required sub-vec4 component
offset can be determined statically (it often can).  If not, we use
dynamic writemasks to sort it out.
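
(Condensed sketch of that decision as it appears in load_urb/store_urb
below; the real code also handles vec8 loads and the Xe2 path.)

    unsigned first_component = io_component(intrin);
    if (io_vec4_static_mod(offset, &first_component)) {
       /* (offset % 4) is known: fold it into first_component and use
        * static channel selection / a static writemask.
        */
    } else {
       /* Resolve the component offset at run time: dynamic channel
        * extraction for loads, a shifted writemask for stores.
        */
    }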

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38918>
Kenneth Graunke, 2025-12-03 14:59:25 -08:00 (committed by Marge Bot)
parent 97857d3224
commit 788c49ecc6

@@ -69,6 +69,14 @@ is_output(nir_intrinsic_instr *intrin)
struct brw_lower_urb_cb_data {
const struct intel_device_info *devinfo;
/* If true, all access is guaranteed to be vec4 (128-bit) aligned.
* offset and base are in units of 128-bit vec4 slots.
*
* If false, all access is guaranteed to be 32-bit aligned.
* offset is in 32-bit units, but base is still in 128-bit vec4 units.
*/
bool vec4_access;
/** Map from VARYING_SLOT_* to a vec4 slot index */
const int8_t *varying_to_slot;
@@ -83,6 +91,23 @@ struct brw_lower_urb_cb_data {
int tes_per_patch_slots;
};
/**
* Given an URB offset in 32-bit units, determine whether (offset % 4)
* is statically known. If so, add this to the value of first_component.
*/
static bool
io_vec4_static_mod(nir_def *offset_32b, unsigned *first_component)
{
unsigned mod;
const bool mod_known =
nir_mod_analysis(nir_get_scalar(offset_32b, 0), nir_type_uint, 4, &mod);
if (mod_known)
*first_component += mod;
return mod_known;
}
static unsigned
io_component(nir_intrinsic_instr *io)
{
@@ -115,6 +140,10 @@ urb_offset(nir_builder *b,
{
nir_def *offset = nir_get_io_offset_src(io)->ssa;
/* Convert vec4 slot offset to 32-bit dwords */
if (!cb_data->vec4_access)
offset = nir_ishl_imm(b, offset, 2);
nir_src *index = nir_get_io_arrayed_index_src(io);
if (index) {
nir_def *stride = cb_data->dynamic_tes
@@ -156,24 +185,51 @@ load_urb(nir_builder *b,
const unsigned base = io_base_slot(intrin, cb_data);
if (devinfo->ver >= 20) {
nir_def *addr = nir_iadd(b, handle, nir_ishl_imm(b, offset, 4));
return nir_load_urb_lsc_intel(b, intrin->def.num_components, bits, addr,
offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
return nir_load_urb_lsc_intel(b, intrin->def.num_components, bits,
nir_iadd(b, handle, offset),
16 * base + 4 * io_component(intrin),
.access = access);
}
/* Load a whole vec4 and return the desired portion */
const unsigned first_component = io_component(intrin);
const unsigned components = intrin->def.num_components + first_component;
assert(components <= 4);
/* Load a whole vec4 or vec8 and return the desired portion */
unsigned first_component = io_component(intrin);
nir_component_mask_t mask = nir_component_mask(intrin->def.num_components);
/* If the offset is in vec4 units, do a straightforward load */
if (cb_data->vec4_access) {
assert(intrin->def.num_components <= 4);
nir_def *load =
nir_load_urb_vec4_intel(b, 4, bits, handle, offset,
.base = base, .access = access);
return nir_channels(b, load, mask << first_component);
}
/* Otherwise, the offset is in 32-bit units. Split it into a vec4-aligned
* slot offset and a 32-bit component offset.
*/
nir_def *mod = nir_iand_imm(b, offset, 0x3);
nir_def *vec4_offset = nir_ishr_imm(b, offset, 2);
const bool static_mod = io_vec4_static_mod(offset, &first_component);
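/* A single vec4 read suffices when the statically-known component offset
 * plus the number of components fits within one 128-bit slot, or when the
 * load is a scalar (a scalar can never straddle a slot boundary).
 * Otherwise, read a whole vec8 so a straddling vec4 is still covered.
 */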
const bool single_vec4 = (static_mod || intrin->def.num_components == 1)
&& first_component + intrin->def.num_components <= 4;
nir_def *load =
nir_load_urb_vec4_intel(b, components, bits, handle, offset,
.base = base, .access = access);
nir_component_mask_t mask =
nir_component_mask(intrin->def.num_components) << first_component;
nir_load_urb_vec4_intel(b, single_vec4 ? 4 : 8, bits, handle,
vec4_offset, .base = base, .access = access);
return nir_channels(b, load, mask);
if (static_mod) {
return nir_channels(b, load, mask << first_component);
} else {
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < intrin->def.num_components; i++) {
comps[i] =
nir_vector_extract(b, load,
nir_iadd_imm(b, mod, first_component + i));
}
return nir_vec(b, comps, intrin->def.num_components);
}
}
static void
@@ -185,13 +241,15 @@ store_urb(nir_builder *b,
{
const struct intel_device_info *devinfo = cb_data->devinfo;
const unsigned base = io_base_slot(intrin, cb_data);
unsigned first_component = io_component(intrin);
nir_def *src = intrin->src[0].ssa;
unsigned mask = nir_intrinsic_write_mask(intrin);
if (devinfo->ver >= 20) {
nir_def *addr = nir_iadd(b, urb_handle, nir_ishl_imm(b, offset, 4));
offset = nir_ishl_imm(b, offset, cb_data->vec4_access ? 4 : 2);
nir_def *addr = nir_iadd(b, urb_handle, offset);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
@@ -203,18 +261,42 @@ store_urb(nir_builder *b,
nir_store_urb_lsc_intel(b, nir_channels(b, src, cur_mask), addr,
.base = cur_base);
}
} else {
const unsigned first_component = io_component(intrin);
if (first_component) {
const unsigned components = src->num_components + first_component;
assert(components <= 4);
mask <<= first_component;
src = nir_shift_channels(b, src, first_component, components);
}
nir_store_urb_vec4_intel(b, src, urb_handle, offset,
nir_imm_int(b, mask), .base = base);
return;
}
nir_def *channel_mask = nir_imm_int(b, mask);
const bool static_mod = cb_data->vec4_access ||
io_vec4_static_mod(offset, &first_component);
if (static_mod) {
src = nir_shift_channels(b, src, first_component,
align(src->num_components + first_component, 4));
channel_mask = nir_ishl_imm(b, channel_mask, first_component);
} else {
offset = nir_iadd_imm(b, offset, first_component);
nir_def *undef = nir_undef(b, 1, src->bit_size);
nir_def *mod = nir_iand_imm(b, offset, 0x3);
channel_mask = nir_ishl(b, channel_mask, mod);
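/* The component offset is only known at run time: scatter the source
 * channels into a vec8 according to the shifted writemask, filling the
 * channels that won't be written with undef.
 */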
nir_def *comps[8];
for (unsigned i = 0; i < 8; i++) {
nir_def *cond = nir_i2b(b, nir_iand_imm(b, channel_mask, 1u << i));
nir_def *src_idx = nir_imax_imm(b, nir_isub_imm(b, i, mod), 0);
nir_def *src_comp = src->num_components == 1 ? src :
nir_vector_extract(b, src, src_idx);
comps[i] = nir_bcsel(b, cond, src_comp, undef);
}
src = nir_vec(b, comps, 8);
}
nir_def *vec4_offset =
cb_data->vec4_access ? offset : nir_ishr_imm(b, offset, 2);
nir_store_urb_vec4_intel(b, src, urb_handle, vec4_offset, channel_mask,
.base = base);
}
static nir_def *
@@ -248,7 +330,7 @@ try_load_push_input(nir_builder *b,
nir_intrinsic_instr *io,
nir_def *offset)
{
if (!nir_def_is_const(offset))
if (!nir_def_is_const(offset) || !cb_data->vec4_access)
return NULL;
const unsigned base = io_base_slot(io, cb_data) +
@@ -839,6 +921,7 @@ brw_nir_lower_tes_inputs(nir_shader *nir,
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
.dynamic_tes = vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE,
@@ -1116,6 +1199,7 @@ brw_nir_lower_tcs_inputs(nir_shader *nir,
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.varying_to_slot = input_vue_map->varying_to_slot,
};
NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, &cb_data);
@@ -1142,6 +1226,7 @@ brw_nir_lower_tcs_outputs(nir_shader *nir,
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
};