brw: Convert GS pulled inputs to use URB intrinsics

We leave GS pushed inputs using load_per_vertex_input for now - they're
relatively simple, and load_attribute_payload doesn't work well for them
since it's assumed to be convergent (as it is for TES and FS inputs),
while GS inputs are divergent.
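
As a rough sketch of the intended split (intrinsic spellings abbreviated,
not exact NIR), a pushed input keeps its original form while a pulled
input becomes an explicit handle fetch plus a URB read:

   /* pushed: read straight from the thread payload */
   value  = load_per_vertex_input(vertex, offset)    /* base=slot, component=c */

   /* pulled: fetch the ICP handle, then read the URB through it */
   handle = load_urb_input_handle_indexed_intel(vertex)
   value  = load_urb(handle, offset)    /* emitted by the URB input lowering pass */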

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38990>
Kenneth Graunke, 2025-10-27 22:09:26 -07:00, committed by Marge Bot
parent eae3bd19d4
commit d83c699045
2 changed files with 110 additions and 197 deletions


@@ -2690,160 +2690,6 @@ brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst,
bld.VEC(dst, comps, n);
}
static void
emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
const nir_src &vertex_src,
unsigned base_offset,
const nir_src &offset_src,
unsigned num_components,
unsigned first_component)
{
const brw_builder &bld = ntb.bld;
const struct intel_device_info *devinfo = ntb.devinfo;
brw_shader &s = ntb.s;
assert(brw_type_size_bytes(dst.type) == 4);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
/* TODO: figure out push input layout for invocations > 1 */
if (gs_prog_data->invocations == 1 &&
nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
nir_src_as_uint(vertex_src) * push_reg_count;
const brw_reg attr = offset(brw_attr_reg(0, dst.type), bld,
first_component + imm_offset);
brw_combine_with_vec(bld, dst, attr, num_components);
return;
}
/* Resort to the pull model. Ensure the VUE handles are provided. */
assert(gs_prog_data->base.include_vue_handles);
brw_reg start = s.gs_payload().icp_handle_start;
brw_reg icp_handle = ntb.bld.vgrf(BRW_TYPE_UD);
const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
if (gs_prog_data->invocations == 1) {
if (nir_src_is_const(vertex_src)) {
/* The vertex index is constant; just select the proper URB handle. */
icp_handle =
byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
* indicating that channel <n> should read the handle from
* DWord <n>. We convert that to bytes by multiplying by 4.
*
* Next, we convert the vertex index to bytes by multiplying
* by 32/64 (shifting by 5/6), and add the two together. This is
* the final indirect byte offset.
*/
brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
/* Convert vertex_index to bytes (multiply by 32/64) */
assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
brw_reg vertex_offset_bytes =
bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
brw_imm_ud(ffs(grf_size_bytes) - 1));
brw_reg icp_offset_bytes =
bld.ADD(vertex_offset_bytes, channel_offsets);
/* Use first_icp_handle as the base offset. There is one register
* of URB handles per vertex, so inform the register allocator that
* we might read up to nir->info.gs.vertices_in registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
brw_reg(icp_offset_bytes),
brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
}
} else {
assert(gs_prog_data->invocations > 1);
if (nir_src_is_const(vertex_src)) {
unsigned vertex = nir_src_as_uint(vertex_src);
bld.MOV(icp_handle, component(start, vertex));
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* Convert vertex_index to bytes (multiply by 4)
*/
brw_reg icp_offset_bytes =
bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
brw_imm_ud(2u));
/* Use first_icp_handle as the base offset. There is one DWord
* of URB handles per vertex, so inform the register allocator that
* we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
brw_reg(icp_offset_bytes),
brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
grf_size_bytes));
}
}
brw_urb_inst *urb;
brw_reg indirect_offset = get_nir_src(ntb, offset_src, 0);
if (nir_src_is_const(offset_src)) {
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
/* Constant indexing - use global offset. */
if (first_component != 0) {
unsigned read_components = num_components + first_component;
brw_reg tmp = bld.vgrf(dst.type, read_components);
urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
urb->size_written = read_components *
tmp.component_size(urb->exec_size);
brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
num_components);
} else {
urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
urb->size_written = num_components *
dst.component_size(urb->exec_size);
}
urb->offset = base_offset + nir_src_as_uint(offset_src);
} else {
/* Indirect indexing - use per-slot offsets as well. */
unsigned read_components = num_components + first_component;
brw_reg tmp = bld.vgrf(dst.type, read_components);
/* Convert oword offset to bytes on Xe2+ */
if (devinfo->ver >= 20)
indirect_offset = bld.SHL(indirect_offset, brw_imm_ud(4u));
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
if (first_component != 0) {
urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
urb->size_written = read_components *
tmp.component_size(urb->exec_size);
brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
num_components);
} else {
urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
urb->size_written = num_components *
dst.component_size(urb->exec_size);
}
urb->offset = base_offset;
}
if (devinfo->ver >= 20)
urb->offset *= 16;
}
static void
brw_from_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
nir_intrinsic_instr *instr)
@@ -3198,6 +3044,8 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
assert(s.stage == MESA_SHADER_GEOMETRY);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
brw_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(ntb, instr->def);
@@ -3212,11 +3060,91 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
case nir_intrinsic_load_input:
UNREACHABLE("load_input intrinsics are invalid for the GS stage");
case nir_intrinsic_load_per_vertex_input:
emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
instr->src[1], instr->num_components,
nir_intrinsic_component(instr));
case nir_intrinsic_load_per_vertex_input: {
/* Load a push input (assuming single invocation layout) */
assert(s.nir->info.gs.invocations == 1);
assert(nir_src_as_uint(instr->src[1]) == 0);
const unsigned vertex = nir_src_as_uint(instr->src[0]);
const unsigned stride = gs_prog_data->base.urb_read_length * 8;
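      /* Hypothetical worked example: with urb_read_length == 2, each vertex
       * occupies 16 pushed DWords, so vertex 1, base slot 3, component 2
       * lands at attribute DWord 1 * 16 + 4 * 3 + 2 = 30.
       */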
const unsigned imm_offset = vertex * stride +
4 * nir_intrinsic_base(instr) +
nir_intrinsic_component(instr);
const brw_reg attr = offset(brw_attr_reg(0, dest.type), bld, imm_offset);
brw_combine_with_vec(bld, dest, attr, instr->num_components);
break;
}
case nir_intrinsic_load_urb_input_handle_indexed_intel: {
const unsigned grf_size_bytes = REG_SIZE * reg_unit(ntb.devinfo);
brw_reg start = s.gs_payload().icp_handle_start;
dest.type = start.type;
if (gs_prog_data->invocations == 1) {
if (nir_src_is_const(instr->src[0])) {
/* Vertex index is constant; just select the proper URB handle. */
bld.MOV(dest, byte_offset(start, grf_size_bytes *
nir_src_as_uint(instr->src[0])));
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
* indicating that channel <n> should read the handle from
* DWord <n>. We convert that to bytes by multiplying by 4.
*
* Next, we convert the vertex index to bytes by multiplying
* by 32/64 (shifting by 5/6), and add the two together. This is
* the final indirect byte offset.
*/
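            /* Worked example (assuming a 32-byte GRF, so a shift of 5):
             * for vertex_index == 2, vertex_offset_bytes = 2 << 5 = 64 and
             * channel 3 adds 4 * 3 = 12, so channel 3 reads its URB handle
             * from byte 76 of the ICP handle region.
             */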
brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
/* Convert vertex_index to bytes (multiply by 32/64) */
assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* ffs() */
brw_reg vertex_offset_bytes =
bld.SHL(retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_UD),
brw_imm_ud(ffs(grf_size_bytes) - 1));
brw_reg icp_offset_bytes =
bld.ADD(vertex_offset_bytes, channel_offsets);
/* Use first_icp_handle as the base offset. There is one register
* of URB handles per vertex, so inform the register allocator that
* we might read up to nir->info.gs.vertices_in registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, dest, start,
brw_reg(icp_offset_bytes),
brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
}
} else {
assert(gs_prog_data->invocations > 1);
if (nir_src_is_const(instr->src[0])) {
unsigned vertex = nir_src_as_uint(instr->src[0]);
bld.MOV(dest, component(start, vertex));
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* Convert vertex_index to bytes (multiply by 4)
*/
brw_reg icp_offset_bytes =
bld.SHL(retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_UD),
brw_imm_ud(2u));
/* Use first_icp_handle as the base offset. There is one DWord
* of URB handles per vertex, so inform the register allocator that
* we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, dest, start,
brw_reg(icp_offset_bytes),
brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
grf_size_bytes));
}
}
break;
}
case nir_intrinsic_emit_vertex_with_counter:
emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));


@@ -345,11 +345,13 @@ try_load_push_input(nir_builder *b,
nir_intrinsic_instr *io,
nir_def *offset)
{
const enum mesa_shader_stage stage = b->shader->info.stage;
if (!nir_def_is_const(offset))
return NULL;
const unsigned offset_unit = cb_data->vec4_access ? 16 : 4;
const uint32_t byte_offset =
uint32_t byte_offset =
16 * io_base_slot(io, cb_data) + 4 * io_component(io, cb_data) +
offset_unit * nir_src_as_uint(nir_src_for_ssa(offset));
assert((byte_offset % 4) == 0);
@@ -357,6 +359,16 @@ try_load_push_input(nir_builder *b,
if (byte_offset >= cb_data->max_push_bytes)
return NULL;
if (stage == MESA_SHADER_GEOMETRY) {
/* GS push inputs still use load_per_vertex_input */
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
const int slot = cb_data->varying_to_slot[io_sem.location];
assert(slot != -1);
nir_intrinsic_set_base(io, slot);
nir_intrinsic_set_component(io, io_component(io, cb_data));
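      /* Returning the intrinsic's own def tells lower_urb_inputs() that the
       * load was rewritten in place and must not be replaced.
       */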
return &io->def;
}
return load_push_input(b, io, byte_offset);
}
@@ -377,7 +389,8 @@ lower_urb_inputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
load = load_urb(b, cb_data, intrin, input_handle(b, intrin), offset,
ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE);
}
nir_def_replace(&intrin->def, load);
if (load != &intrin->def)
nir_def_replace(&intrin->def, load);
return true;
}
return false;
@@ -909,9 +922,6 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
const struct intel_vue_map *vue_map,
unsigned *out_urb_read_length)
{
nir_foreach_shader_in_variable(var, nir)
var->data.driver_location = var->data.location;
/* Inputs are stored in vec4 slots, so use type_size_vec4(). */
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
@@ -919,40 +929,6 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic == nir_intrinsic_load_input ||
intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
/* Offset 0 is the VUE header, which contains
* VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
* VARYING_SLOT_PSIZ [.w].
*/
nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
gl_varying_slot varying = io_sem.location;
int vue_slot;
switch (varying) {
case VARYING_SLOT_PSIZ:
nir_intrinsic_set_base(intrin, 0);
nir_intrinsic_set_component(intrin, 3);
break;
default:
vue_slot = vue_map->varying_to_slot[varying];
assert(vue_slot != -1);
nir_intrinsic_set_base(intrin, vue_slot);
break;
}
}
}
}
}
unsigned urb_read_length = 0;
if (nir->info.gs.invocations == 1) {
@@ -974,6 +950,15 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
}
*out_urb_read_length = urb_read_length;
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.vec4_access = true,
/* pushed bytes per vertex */
.max_push_bytes = urb_read_length * 8 * sizeof(uint32_t),
.varying_to_slot = vue_map->varying_to_slot,
};
NIR_PASS(_, nir, brw_nir_lower_inputs_to_urb_intrinsics, &cb_data);
}
void