diff --git a/src/intel/compiler/brw/brw_compile_gs.cpp b/src/intel/compiler/brw/brw_compile_gs.cpp index 7639070dc60..8f69c33cd63 100644 --- a/src/intel/compiler/brw/brw_compile_gs.cpp +++ b/src/intel/compiler/brw/brw_compile_gs.cpp @@ -176,7 +176,8 @@ brw_compile_gs(const struct brw_compiler *compiler, pos_slots); brw_nir_apply_key(nir, compiler, &key->base, dispatch_width); - brw_nir_lower_gs_inputs(nir, &input_vue_map); + brw_nir_lower_gs_inputs(nir, compiler->devinfo, &input_vue_map, + &prog_data->base.urb_read_length); brw_nir_lower_vue_outputs(nir); brw_postprocess_nir(nir, compiler, dispatch_width, params->base.archiver, debug_enabled, @@ -338,11 +339,6 @@ brw_compile_gs(const struct brw_compiler *compiler, prog_data->vertices_in = nir->info.gs.vertices_in; - /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we - * need to program a URB read length of ceiling(num_slots / 2). - */ - prog_data->base.urb_read_length = (input_vue_map.num_slots + 1) / 2; - /* Now that prog_data setup is done, we are ready to actually compile the * program. */ diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c index 0f8d596d0c9..618f1010d57 100644 --- a/src/intel/compiler/brw/brw_nir.c +++ b/src/intel/compiler/brw/brw_nir.c @@ -905,7 +905,9 @@ brw_nir_lower_vs_inputs(nir_shader *nir) void brw_nir_lower_gs_inputs(nir_shader *nir, - const struct intel_vue_map *vue_map) + const struct intel_device_info *devinfo, + const struct intel_vue_map *vue_map, + unsigned *out_urb_read_length) { nir_foreach_shader_in_variable(var, nir) var->data.driver_location = var->data.location; @@ -950,6 +952,28 @@ brw_nir_lower_gs_inputs(nir_shader *nir, } } } + + unsigned urb_read_length = 0; + + if (nir->info.gs.invocations == 1) { + /* URB read length is in 256-bit units, which is two vec4s. */ + urb_read_length = DIV_ROUND_UP(vue_map->num_slots, 2); + + /* Because we're operating in scalar mode, the two vec4s take + * up 8 registers. 
Additionally, the GS reads URB Read Length + * for each vertex being processed, so each unit of read length + * takes up 8 * VerticesIn registers. + */ + const unsigned regs_per_read = 8 * nir->info.gs.vertices_in; + + /* Limit to 24 registers worth of pushed inputs (may clamp to zero). */ + const unsigned max_push_regs = 24; + + if (urb_read_length * regs_per_read > max_push_regs) + urb_read_length = max_push_regs / regs_per_read; + } + + *out_urb_read_length = urb_read_length; } void diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h index 556f481e62f..f0f014542c7 100644 --- a/src/intel/compiler/brw/brw_nir.h +++ b/src/intel/compiler/brw/brw_nir.h @@ -235,7 +235,9 @@ bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lowe void brw_nir_lower_vs_inputs(nir_shader *nir); void brw_nir_lower_gs_inputs(nir_shader *nir, - const struct intel_vue_map *vue_map); + const struct intel_device_info *devinfo, + const struct intel_vue_map *vue_map, + unsigned *out_urb_read_length); void brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_device_info *devinfo, const struct intel_vue_map *vue); diff --git a/src/intel/compiler/brw/brw_thread_payload.cpp b/src/intel/compiler/brw/brw_thread_payload.cpp index 1fc6751f1a4..78b45cd64f7 100644 --- a/src/intel/compiler/brw/brw_thread_payload.cpp +++ b/src/intel/compiler/brw/brw_thread_payload.cpp @@ -100,7 +100,6 @@ brw_tes_thread_payload::brw_tes_thread_payload(const brw_shader &v) brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v) { - struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data); struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data); const brw_builder bld = brw_builder(&v); @@ -136,21 +135,6 @@ brw_gs_thread_payload::brw_gs_thread_payload(brw_shader &v) r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo); num_regs = r; - - /* Use a maximum of 24 registers for push-model inputs. 
*/ - const unsigned max_push_components = 24; - - /* If pushing our inputs would take too many registers, reduce the URB read - * length (which is in HWords, or 8 registers), and resort to pulling. - * - * Note that the GS reads HWords for every vertex - so we - * have to multiply by VerticesIn to obtain the total storage requirement. - */ - if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in > - max_push_components) { - vue_prog_data->urb_read_length = - ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8; - } } static inline void