brw: Move GS URB Read Length limiting to brw_nir_lower_gs_inputs()

We're going to be deciding on push vs. pull in the NIR lowering pass soon, so move the code to limit our register usage from brw's thread payload code to brw_nir_lower_gs_inputs().

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38990>
2025-12-20 13:50:11 +01:00 · 2025-10-27 22:09:26 -07:00 · 2025-10-27 22:09:26 -07:00 · eae3bd19d4
commit eae3bd19d4
parent 8889802271
4 changed files with 30 additions and 24 deletions
--- a/src/intel/compiler/brw/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw/brw_compile_gs.cpp
@ -176,7 +176,8 @@ brw_compile_gs(const struct brw_compiler *compiler,
                       pos_slots);
   brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
-   brw_nir_lower_gs_inputs(nir, &input_vue_map);
+   brw_nir_lower_gs_inputs(nir, compiler->devinfo, &input_vue_map,
                           &prog_data->base.urb_read_length);
   brw_nir_lower_vue_outputs(nir);
   brw_postprocess_nir(nir, compiler, dispatch_width,
                       params->base.archiver, debug_enabled,
@ -338,11 +339,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
   prog_data->vertices_in = nir->info.gs.vertices_in;
   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
    * need to program a URB read length of ceiling(num_slots / 2).
    */
   prog_data->base.urb_read_length = (input_vue_map.num_slots + 1) / 2;
   /* Now that prog_data setup is done, we are ready to actually compile the
    * program.
    */
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@ -905,7 +905,9 @@ brw_nir_lower_vs_inputs(nir_shader *nir)
 void
 brw_nir_lower_gs_inputs(nir_shader *nir,
-                        const struct intel_vue_map *vue_map)
+                        const struct intel_device_info *devinfo,
                        const struct intel_vue_map *vue_map,
                        unsigned *out_urb_read_length)
 {
   nir_foreach_shader_in_variable(var, nir)
      var->data.driver_location = var->data.location;
@ -950,6 +952,28 @@ brw_nir_lower_gs_inputs(nir_shader *nir,
         }
      }
   }
   unsigned urb_read_length = 0;
   if (nir->info.gs.invocations == 1) {
      /* URB read length is in 256-bit units, which is two vec4s. */
      urb_read_length = DIV_ROUND_UP(vue_map->num_slots, 2);
      /* Because we're operating in scalar mode, the two vec4s take
       * up 8 registers.  Additionally, the GS reads URB Read Length
       * for each vertex being processed, so each unit of read length
       * takes up 8 * VerticesIn registers.
       */
      const unsigned regs_per_read = 8 * nir->info.gs.vertices_in;
      /* Limit to 24 registers worth of pushed inputs */
      const unsigned max_push_regs = 24;
      if (urb_read_length * regs_per_read > max_push_regs)
         urb_read_length = max_push_regs / regs_per_read;
   }
   *out_urb_read_length = urb_read_length;
 }
 void
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@ -235,7 +235,9 @@ bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lowe
 void brw_nir_lower_vs_inputs(nir_shader *nir);
 void brw_nir_lower_gs_inputs(nir_shader *nir,
-                             const struct intel_vue_map *vue_map);
+                             const struct intel_device_info *devinfo,
                             const struct intel_vue_map *vue_map,
                             unsigned *out_urb_read_length);
 void brw_nir_lower_tes_inputs(nir_shader *nir,
                              const struct intel_device_info *devinfo,
                              const struct intel_vue_map *vue);
--- a/src/intel/compiler/brw/brw_thread_payload.cpp
+++ b/src/intel/compiler/brw/brw_thread_payload.cpp
@ -100,7 +100,6 @@ brw_tes_thread_payload::brw_tes_thread_payload(const brw_shader &v)
 brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v)
 {
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
   const brw_builder bld = brw_builder(&v);
@ -136,21 +135,6 @@ brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v)
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);
   num_regs = r;
   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;
   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
 }
 static inline void