brw: Move GS URB Read Length limiting to brw_nir_lower_gs_inputs()

We're going to be deciding on push vs. pull in the NIR lowering pass
soon, so move the code to limit our register usage from brw's thread
payload code to brw_nir_lower_gs_inputs().

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38990>
This commit is contained in:
Kenneth Graunke 2025-10-27 22:09:26 -07:00 committed by Marge Bot
parent 8889802271
commit eae3bd19d4
4 changed files with 30 additions and 24 deletions

View file

@ -176,7 +176,8 @@ brw_compile_gs(const struct brw_compiler *compiler,
pos_slots); pos_slots);
brw_nir_apply_key(nir, compiler, &key->base, dispatch_width); brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
brw_nir_lower_gs_inputs(nir, &input_vue_map); brw_nir_lower_gs_inputs(nir, compiler->devinfo, &input_vue_map,
&prog_data->base.urb_read_length);
brw_nir_lower_vue_outputs(nir); brw_nir_lower_vue_outputs(nir);
brw_postprocess_nir(nir, compiler, dispatch_width, brw_postprocess_nir(nir, compiler, dispatch_width,
params->base.archiver, debug_enabled, params->base.archiver, debug_enabled,
@ -338,11 +339,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
prog_data->vertices_in = nir->info.gs.vertices_in; prog_data->vertices_in = nir->info.gs.vertices_in;
/* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
* need to program a URB read length of ceiling(num_slots / 2).
*/
prog_data->base.urb_read_length = (input_vue_map.num_slots + 1) / 2;
/* Now that prog_data setup is done, we are ready to actually compile the /* Now that prog_data setup is done, we are ready to actually compile the
* program. * program.
*/ */

View file

@ -905,7 +905,9 @@ brw_nir_lower_vs_inputs(nir_shader *nir)
void void
brw_nir_lower_gs_inputs(nir_shader *nir, brw_nir_lower_gs_inputs(nir_shader *nir,
const struct intel_vue_map *vue_map) const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map,
unsigned *out_urb_read_length)
{ {
nir_foreach_shader_in_variable(var, nir) nir_foreach_shader_in_variable(var, nir)
var->data.driver_location = var->data.location; var->data.driver_location = var->data.location;
@ -950,6 +952,28 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
} }
} }
} }
unsigned urb_read_length = 0;
if (nir->info.gs.invocations == 1) {
/* URB read length is in 256-bit units, which is two vec4s. */
urb_read_length = DIV_ROUND_UP(vue_map->num_slots, 2);
/* Because we're operating in scalar mode, the two vec4s take
* up 8 registers. Additionally, the GS reads URB Read Length
* for each vertex being processed, so each unit of read length
* takes up 8 * VerticesIn registers.
*/
const unsigned regs_per_read = 8 * nir->info.gs.vertices_in;
/* Limit to 24 registers worth of pushed inputs */
const unsigned max_push_regs = 24;
if (urb_read_length * regs_per_read > max_push_regs)
urb_read_length = max_push_regs / regs_per_read;
}
*out_urb_read_length = urb_read_length;
} }
void void

View file

@ -235,7 +235,9 @@ bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lowe
void brw_nir_lower_vs_inputs(nir_shader *nir); void brw_nir_lower_vs_inputs(nir_shader *nir);
void brw_nir_lower_gs_inputs(nir_shader *nir, void brw_nir_lower_gs_inputs(nir_shader *nir,
const struct intel_vue_map *vue_map); const struct intel_device_info *devinfo,
const struct intel_vue_map *vue_map,
unsigned *out_urb_read_length);
void brw_nir_lower_tes_inputs(nir_shader *nir, void brw_nir_lower_tes_inputs(nir_shader *nir,
const struct intel_device_info *devinfo, const struct intel_device_info *devinfo,
const struct intel_vue_map *vue); const struct intel_vue_map *vue);

View file

@ -100,7 +100,6 @@ brw_tes_thread_payload::brw_tes_thread_payload(const brw_shader &v)
brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v) brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v)
{ {
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data); struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
const brw_builder bld = brw_builder(&v); const brw_builder bld = brw_builder(&v);
@ -136,21 +135,6 @@ brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v)
r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo); r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);
num_regs = r; num_regs = r;
/* Use a maximum of 24 registers for push-model inputs. */
const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
*
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
* have to multiply by VerticesIn to obtain the total storage requirement.
*/
if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
max_push_components) {
vue_prog_data->urb_read_length =
ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
}
} }
static inline void static inline void